diff --git a/CMakeLists.txt b/CMakeLists.txt index bdbf5a6ea604400fb5087976df0e1e9c279fd78d..7f1cffd332dfc4f1614ca63ed60f358acf59a74b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,22 @@ -cmake_minimum_required(VERSION 3.0) -project(paddle-mobile) - -# select the platform to build -option(CPU "armv7 with neon support" ON) -option(MALI_GPU "mali gpu support" OFF) -option(FPGA "fpga support" OFF) +cmake_minimum_required(VERSION 3.0.0) -option(USE_OPENMP "openmp support" OFF) +option(USE_OPENMP "openmp support" ON) option(DEBUGING "enable debug mode" ON) -option(USE_EXCEPTION "use std exception" OFF) +option(USE_EXCEPTION "use std exception" ON) +option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io option(LOG_PROFILE "log profile" OFF) +# select the platform to build +option(CPU "armv7 with neon" ON) +option(GPU_MALI "mali gpu" OFF) +option(GPU_CL "opencl gpu" OFF) +option(FPGA "fpga" OFF) +if(FPGA) + option(FPGAV1 "fpga v1" ON) + option(FPGAV2 "fpga v2" OFF) +endif() + + +project(paddle-mobile) file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) @@ -29,10 +36,10 @@ if(DEBUGING) message(STATUS "debugging mode") add_definitions(-DPADDLE_MOBILE_DEBUG) else() - if(FPGA) - else() - add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) - endif() +endif() + +if(SYMBOL_HIDDEN) + add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) endif() if(USE_EXCEPTION) @@ -70,7 +77,27 @@ else() endforeach() endif() -if(MALI_GPU) +if (GPU_CL) + add_definitions(-DPADDLE_MOBILE_CL) + + # opencl version + add_definitions(-DCL_TARGET_OPENCL_VERSION=220) + + link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so) + include_directories(third_party/opencl/OpenCL-Headers) +else() + file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + + 
file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() +endif() + +if (GPU_MALI) add_definitions(-DPADDLE_MOBILE_MALI_GPU) add_definitions(-DUSE_ACL=1) add_definitions(-DUSE_OPENCL) @@ -96,8 +123,43 @@ else() endif() if(FPGA) - message("FPGA mode enabled") add_definitions(-DPADDLE_MOBILE_FPGA) + file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() + list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp) + list(APPEND PADDLE_MOBILE_H src/operators/math/softmax.h) + list(APPEND PADDLE_MOBILE_H src/operators/math/math_func_neon.h) + if(FPGAV1) + message("FPGA_V1 enabled") + add_definitions(-DPADDLE_MOBILE_FPGA_V1) + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() + endif() + if(FPGAV2) + message("FPGA_V2 enabled") + add_definitions(-DPADDLE_MOBILE_FPGA_V2) + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() + endif() + else() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) foreach(f ${_tmp_list}) list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) endforeach() @@ -124,17 +186,17 @@ endif() if(ANDROID_NDK_TOOLCHAIN_INCLUDED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") else() - list(REMOVE_ITEM 
PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) endif() if(IS_IOS) else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) -endif() + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h) +endif () set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -143,8 +205,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) # NET default -if(FPGA) - set(NET "FPGAnets" CACHE STRING "select net type") +if(FPGAV1) + set(NET "FPGA_NET_V1" CACHE STRING "select net type") +elseif(FPGAV2) + set(NET "FPGA_NET_V2" CACHE STRING "select net type") else() set(NET "default" CACHE STRING "select net type") endif() diff --git a/README.md b/README.md index ee4e20513186979fe76c1259e7fc3ca962426843..2572f25444dc4268e7a6a3f43cfdc1b38dae8e02 100644 --- a/README.md +++ b/README.md @@ -8,46 +8,23 @@ [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)--> -欢迎来到 Paddle-Mobile GitHub 项目。 - -Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。Paddle-Mobile设计思想和PaddlePaddle的最新版fluid版本保持了高度一致,同时针对嵌入式做了大量优化。设计之初就对嵌入式的性能、体积、能耗、硬件平台覆盖等方面做了考虑。 - -## 简单搜索线上效果 - 
-如下gif是简单搜索app的线上主体检测应用效果 - -![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif) - -## Demo目录 - -[点我](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo) +欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。 ## Features -- **ARM CPU** - -- **Mali GPU** - -- **苹果设备的GPU Metal实现** - -- **FPGA** +- 高性能支持ARM CPU +- 支持Mali GPU +- 支持Adreno GPU +- 支持苹果设备的GPU Metal实现 +- 支持ZU5、ZU9等FPGA开发板 +- 支持树莓派等arm-linux开发板 - 目前已经支持 ZCU102 开发板。 +## Demo +- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo) -- **灵活性** - - * paddle-mobile cpu版不依赖任何第三库, 可进行快速集成。 - * 使用泛型特化进行平台切换, 可灵活切换 cpu、gpu 和其他协处理器。 - * 可根据特定的常见网络, 进行编译特定的 op, 降低编译时间, 减小包大小。 - * 使用 docker 编译, 提供统一的编译环境。 - * 高可拓展性, 方便拓展其他协处理器, 提供高性能 arm 算子实现, 方便其他协处理器开发者集成开发。 - * 直接兼容 paddle-fluid 模型, 不需要额外的转换操作。 - -- **体积** - - paddle-mobile从设计之初就深入考虑到移动端的包体积的问题,cpu实现中没有外部依赖。在编译过程中,如果该网络不需要的op是完全不会被打入的。同时编译选项优化也为体积压缩提供了帮助。 - 除了二进制体积,我们对代码体积极力避免过大。整个仓库的代码体积也非常小。 +### 原Demo目录 +[https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo) ## 文档 @@ -62,6 +39,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) * [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md) +* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md) ### 贡献文档 - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) @@ -73,18 +51,22 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ### 1. 直接使用Paddle Fluid训练 该方式最为可靠,推荐方式 ### 2. 
caffe转为Paddle Fluid模型 -[链接](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid) +[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid) ### 3. ONNX ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切换”。该项目的目的是让不同的神经网络开发框架做到互通互用。 除直接使用PaddlePaddle训练fluid版本的模型外,还可以通过onnx转换得到个别Paddle fluid模型。 -目前,百度也在做onnx支持工作。相关转换项目在这里:[paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)。 - -![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg) +目前,百度也在做onnx支持工作。相关转换项目在这里: +[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx) ### 4. 部分测试模型和测试图片下载 -[下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) +[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) + + ## 问题解决 @@ -96,5 +78,3 @@ Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](L ## 旧版 Mobile-Deep-Learning 原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) - - diff --git a/demo/ReadMe.md b/demo/ReadMe.md index aa71f75cb7526234bb0bb32e2e5e1f93c1789711..c6d7b3def9fb44db86ea4456396c91354953d99d 100644 --- a/demo/ReadMe.md +++ b/demo/ReadMe.md @@ -1,11 +1,10 @@ -## 如何运行demo -- Android demo下载路径 - http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip -- iOS demo下载路径: - http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip - -在demo目录下执行下载demo的脚本 +## Demo 下载路径 +- [ANDROID](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip) + +- [IOS](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip) + +- 原demo亦可使用getDemo.sh进行下载 + ``` sh getDemo.sh ``` -demo工程就下载解压到当前目录中了。 \ No newline at end of file diff --git a/doc/development_android_GPU.md b/doc/development_android_GPU.md new file mode 
100644 index 0000000000000000000000000000000000000000..03750260cf343692e52fd667cb797e27e7b6983d --- /dev/null +++ b/doc/development_android_GPU.md @@ -0,0 +1,85 @@ +## paddle-mobile GPU开发文档 + +编译环境配置方法请参考development_android.md文档 + +1. 下载 paddle-mobile + +``` +git clone https://github.com/PaddlePaddle/paddle-mobile.git + +adb pull /system/vendor/lib/libOpenCL.so paddle-mobile/third_party/opencl + +修改paddle-mobile/CMakeLists.txt文件,执行如下操作: +option(GPU_CL "opencl gpu" OFF)->option(GPU_CL "opencl gpu" ON) + +cd paddle-mobile/tools + +sh build.sh android + +``` +2. 将单测可执行文件和模型部署到手机 + +下载测试需要的mobilenet和test_image_1x3x224x224_float文件,下载地址:http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip + +``` +cd ../test +mkdir models +mkdir images + +``` +将mobilenet复制到paddle-mobile/test/models目录下 +将test_image_1x3x224x224_float复制到paddle-mobile/test/images目录下 + +执行下面命令将可执行文件和预测需要的文件部署到手机 + +``` +cd ../tools/android-debug-script +sh push2android.sh + +``` +3. 在adb shell中执行对应的可执行文件(目前只支持mobilenet,后续会支持更多的网络模型) + +``` +adb shell +cd /data/local/tmp/bin/ +export LD_LIBRARY_PATH=. +./test-mobilenetgpu + +``` +4. mobilenet cpu模型预测结果 + +假设mobilenet和test_image_1x3x224x224_float文件已经推送到手机上,执行下面命令进行mobilenet cpu的预测 + +``` +adb shell +cd /data/local/tmp/bin/ +export LD_LIBRARY_PATH=. +./test-mobilenet + +``` +5. 
预测结果 + + 手机型号:小米6(CPU 835,GPU Adreno 540) + + mobilenet gpu:预测性能,耗时41ms左右。 + + mobilenet cpu: + + 1线程:108ms + 2线程:65ms + 4线程:38ms + + 手机型号:OPPO Findx(CPU 845,GPU Adreno 630) + + mobilenet gpu:预测性能,耗时27ms左右。 + + mobilenet cpu: + + 1线程:90ms + 2线程:50ms + 4线程:29ms + + + + + diff --git a/doc/development_arm_linux.md b/doc/development_arm_linux.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7874179480ec579c8c7e5b46cd0f1905fb4f9c43 100644 --- a/doc/development_arm_linux.md +++ b/doc/development_arm_linux.md @@ -0,0 +1,28 @@ +# ARM_LINUX开发文档 +目前支持直接在arm_linux平台上编译paddle-mobile + +## 以Raspberrypi3为例: +### 执行编译 +在paddle-mobile根目录中,执行以下命令: +``` +cd tools +/bin/bash build.sh arm_linux googlenet +``` +执行完毕后,生成的so位于paddle-mobile/build/release/arm-linux/build目录中,单测可执行文件位于test/build目录中。 + +### 运行 +``` +cd ../build/release/arm-linux/build +export LD_LIBRARY_PATH=. +cd ../../../../test/build/ +./test-googlenet +``` +*注1:如果本地test目录下没有模型的话,会自动下载官方demo模型并解压.* + +*注2:因为arm_linux设备算力限制,建议编译时,根据需要指定编译某个模型(如googlenet)或扩大系统的swap交换空间,避免编译时卡死.* + +## 其他ARM_LINUX平台 + +其他的arm_linux平台可以修改 tools/build.sh中的相关编译参数进行编译。可以参考对应平台的编译选项。 +特别说明的是Android平台请参考Android开发文档. + diff --git a/doc/development_fpga.md b/doc/development_fpga.md index 3389ddde676a5d1c7b452dc734880eb50170bd3e..1f0d6ffb364fc35cda306ad748c45c085d5986d6 100644 --- a/doc/development_fpga.md +++ b/doc/development_fpga.md @@ -1,6 +1,6 @@ # FPGA开发文档 -FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。 +FPGA平台的代码分为V1和V2。其中V1在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。以下描述适用于复现V1运行的结果。 ## 准备硬件 ___ @@ -17,7 +17,7 @@ ___ ## 编译工程 ___ 1. 将最新的paddle mobile 代码复制到ZCU102开发板中。 -2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。 +2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。设置option(FPGAV1 "fpga v1" ON), option(FPGAV2 "fpga v2" OFF)。 2. 
执行以下命令,可在./test/build下生成test-resnet50可执行程序。 * mkdir build * cd build diff --git a/src/common/common.h b/src/common/common.h index 12157b5e946490d041f0cc0d235142a13a3a2527..c7a681f426f788bcd8ee8f52dbfab3c6e1afeb8f 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include +#include // NOLINT + +namespace paddle_mobile { using Time = decltype(std::chrono::high_resolution_clock::now()); @@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) { ms counter = std::chrono::duration_cast(diff); return counter.count() / 1000.0; } + +} // namespace paddle_mobile diff --git a/src/common/enforce.h b/src/common/enforce.h index aebe2a58031cb1341596f07dbf653be4a5e01900..bf21b5b9a2fe5f70b3bd23a581f0c1dfbf373f42 100644 --- a/src/common/enforce.h +++ b/src/common/enforce.h @@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception { std::string detail(buffer); \ throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \ __FILE__, __LINE__); \ - } + } \ + exit(0); #define PADDLE_MOBILE_ENFORCE(stat, ...) 
\ { \ diff --git a/src/common/types.cpp b/src/common/types.cpp index 8c8de7765161e61dc75036a87a34fc6abd2df43e..510313d9fee0940d7162ea2c6b09426f6d9ce17a 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -40,9 +40,11 @@ const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_RELU = "relu"; const char *G_OP_TYPE_RESHAPE = "reshape"; +const char *G_OP_TYPE_RESHAPE2 = "reshape2"; const char *G_OP_TYPE_SIGMOID = "sigmoid"; const char *G_OP_TYPE_SOFTMAX = "softmax"; const char *G_OP_TYPE_TRANSPOSE = "transpose"; +const char *G_OP_TYPE_TRANSPOSE2 = "transpose2"; const char *G_OP_TYPE_SPLIT = "split"; const char *G_OP_TYPE_FEED = "feed"; const char *G_OP_TYPE_FETCH = "fetch"; @@ -69,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; +extern const char *G_OP_TYPE_TANH = "tanh"; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; std::unordered_map< std::string, std::pair, std::vector>> @@ -80,6 +84,7 @@ std::unordered_map< {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}}, {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, @@ -90,6 +95,7 @@ std::unordered_map< {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, @@ -99,6 +105,7 @@ std::unordered_map< {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, 
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, @@ -124,5 +131,7 @@ std::unordered_map< {G_OP_TYPE_SUM, {{"X"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}}; + {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, + {G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 0855bd053f0dc804b6f3289796f3818657675864..4cd35ac91084f6518858c97cf4c0e8da5b09555b 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -39,7 +39,13 @@ struct PrecisionTrait { }; //! device type -enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; +enum DeviceTypeEnum { + kINVALID = -1, + kCPU = 0, + kFPGA = 1, + kGPU_MALI = 2, + kGPU_CL = 3 +}; template struct DeviceType {}; @@ -47,6 +53,7 @@ struct DeviceType {}; typedef DeviceType CPU; typedef DeviceType FPGA; typedef DeviceType GPU_MALI; +typedef DeviceType GPU_CL; //! data type enum DataType { @@ -132,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL; extern const char *G_OP_TYPE_QUANTIZE; extern const char *G_OP_TYPE_DEQUANTIZE; +extern const char *G_OP_TYPE_TANH; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU; + extern std::unordered_map< std::string, std::pair, std::vector>> op_input_output_key; diff --git a/src/fpga/api.cpp b/src/fpga/V1/api.cpp similarity index 99% rename from src/fpga/api.cpp rename to src/fpga/V1/api.cpp index d3f473a7f43714592779de941ed1a6ea53baea83..04e51ab9b09fabc41fcd1cd73864bc285d183821 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/V1/api.cpp @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "fpga/api.h" +#include "fpga/V1/api.h" #include #include #include #include #include -#include "fpga/bias_scale.h" -#include "fpga/filter.h" -#include "fpga/image.h" +#include "fpga/V1/bias_scale.h" +#include "fpga/V1/filter.h" +#include "fpga/V1/image.h" #define FPGA_TEST_MODE #define PADDLE_MOBILE_OS_LINUX diff --git a/src/fpga/api.h b/src/fpga/V1/api.h similarity index 100% rename from src/fpga/api.h rename to src/fpga/V1/api.h diff --git a/src/fpga/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp similarity index 98% rename from src/fpga/bias_scale.cpp rename to src/fpga/V1/bias_scale.cpp index 50f1ed03f0121b5afdc41d427e5b52675994bd1e..3c2c04dc1d7f76953b04a879fbcfa8377dd7ba8a 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/bias_scale.h" +#include "fpga/V1/bias_scale.h" #include -#include "fpga/api.h" +#include "fpga/V1/api.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/bias_scale.h b/src/fpga/V1/bias_scale.h similarity index 100% rename from src/fpga/bias_scale.h rename to src/fpga/V1/bias_scale.h diff --git a/src/fpga/filter.cpp b/src/fpga/V1/filter.cpp similarity index 99% rename from src/fpga/filter.cpp rename to src/fpga/V1/filter.cpp index db851b926bbbd549205ee5d75bc46a6c04888098..3f4a3e2c876f0b54546f0e385d4a5e8bbfacdf3c 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "fpga/filter.h" +#include "fpga/V1/filter.h" #include #include -#include "fpga/api.h" +#include "fpga/V1/api.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/filter.h b/src/fpga/V1/filter.h similarity index 100% rename from src/fpga/filter.h rename to src/fpga/V1/filter.h diff --git a/src/fpga/image.cpp b/src/fpga/V1/image.cpp similarity index 98% rename from src/fpga/image.cpp rename to src/fpga/V1/image.cpp index dac6e2a633155e593550ede4d738c5606cec3283..73be05c942d6a848db830148d25bc8b3e14b53e4 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/V1/image.cpp @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/image.h" +#include "fpga/V1/image.h" #include #include -#include "fpga/api.h" +#include "fpga/V1/api.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/image.h b/src/fpga/V1/image.h similarity index 100% rename from src/fpga/image.h rename to src/fpga/V1/image.h diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f8a9f119e643b3836ef2c541e098f39ab3cbd17 --- /dev/null +++ b/src/fpga/V2/api.cpp @@ -0,0 +1,295 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fpga/V2/api.h" +#include +#include "fpga/V2/bias_scale.h" +#include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" +#include "fpga/V2/filter.h" +#include "fpga/V2/image.h" + +namespace paddle_mobile { +namespace fpga { + +static std::map memory_map; + +int open_device() { + int ret = driver::open_device_driver(); + return ret; +} + +int close_device() { + int ret = driver::close_device_driver(); + return ret; +} + +void *fpga_malloc(size_t size) { + static uint64_t counter = 0; +#ifdef PADDLE_MOBILE_ZU5 + auto ptr = driver::fpga_malloc_driver(size); +#else + auto ptr = malloc(size); +#endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + // << counter << " bytes"; + return ptr; +} + +void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_free_driver(ptr); +#else + free(ptr); +#endif + counter += size; + // DLOG << "Address: " << ptr << ", " << size << " bytes freed. 
Total " + // << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } +} +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif +} + +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif +} + +void format_image(framework::Tensor *image_tensor) { + auto dims = image_tensor->dims(); + auto channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = image_tensor->data(); + size_t memory_size = channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT + image::format_image(&new_data, (int)channel, (int)height, // NOLINT + (int)width, // NOLINT + aligned_channel); + image_tensor->reset_data_ptr(new_data); +} + +void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto height = dims[2], width = dims[3]; + memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half); + } else if (dims.size() == 2) { + memory_size = aligned_channel * sizeof(half); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto height = dims[2], width = dims[3]; + memory_size = height * width * aligned_channel * sizeof(float); + } else if (dims.size() == 2) { + 
memory_size = aligned_channel * sizeof(float); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +float filter_find_max(framework::Tensor *filter_tensor) { + auto filter_ptr = filter_tensor->data(); + return filter::find_max(filter_ptr, (int)filter_tensor->numel()); // NOLINT +} + +int get_aligned_channel_num(int channel_num) { + return filter::calc_aligned_channel(channel_num); +} + +int get_aligned_filter_num(framework::Tensor *filter_tensor) { + auto dims = filter_tensor->dims(); + return filter::calc_aligned_num((int)dims[0], (int)dims[1]); // NOLINT +} + +int get_conv_output_channel(framework::Tensor *filter_tensor) { + int aligned_filter_num = get_aligned_filter_num(filter_tensor); + return get_aligned_channel_num(aligned_filter_num); +} +void format_filter(framework::Tensor *filter_tensor, float max_value, + int group_num) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT + (int)height, // NOLINT + (int)width, group_num, max_value); // NOLINT + filter_tensor->reset_data_ptr(new_data); +} + +void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * 
width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT + (int)height, // NOLINT + (int)width, 1, max_value); // NOLINT + filter_tensor->reset_data_ptr(new_data); +} + +void format_bias_scale_array(float **bias_scale_array, int filter_num, + int filter_channel) { + int num_after_alignment = + filter::calc_aligned_num(filter_channel, filter_channel); + bias_scale::format_bias_scale_array(bias_scale_array, filter_num, + num_after_alignment); +} + +void format_concat_output(framework::Tensor *out, int height, int width, + uint32_t out_channel) { + auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half)); + auto ddim = framework::make_ddim({1, out_channel, height, width}); + out->Resize(ddim); + out->reset_data_ptr(data_ptr); +} + +int format_conv_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float *bs_ptr, int group) { + float max_value = fpga::filter_find_max(filter_tensor); + fpga::format_filter(filter_tensor, max_value, group); + int aligned_num = get_aligned_filter_num(filter_tensor); + fpga::format_bias_scale_array(&bs_ptr, + (int)filter_tensor->dims()[0], // NOLINT + aligned_num); + int aligned_channel = fpga::get_conv_output_channel(filter_tensor); + fpga::format_fp16_ofm(ofm_tensor, aligned_channel); + DLOG << aligned_channel; + return aligned_channel; +} + +int format_fc_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float *bs_ptr) { + float max_value = fpga::filter_find_max(filter_tensor); + fpga::format_fc_filter(filter_tensor, max_value); + int aligned_num = get_aligned_filter_num(filter_tensor); + fpga::format_bias_scale_array(&bs_ptr, + (int)filter_tensor->dims()[0], // NOLINT + aligned_num); + int aligned_channel = fpga::get_conv_output_channel(filter_tensor); + fpga::format_fp16_ofm(ofm_tensor, aligned_channel); + DLOG << aligned_channel; + 
return aligned_channel; +} + +void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float *bs_ptr) { + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); + + arg->group_num = (uint32_t)group_num; + arg->split_num = 1; + arg->filter_num = (uint32_t)filter->dims()[0]; + arg->output.address = out_ptr; + arg->output.scale_address = out->scale; + arg->conv_arg = + (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT + + arg->concat_arg.image_num = arg->split_num; + arg->concat_arg.image_out = out_ptr; + arg->concat_arg.scale_out = out->scale; + arg->concat_arg.height = (uint32_t)out->dims()[2]; + arg->concat_arg.width = (uint32_t)out->dims()[3]; + + int n = arg->split_num; + arg->concat_arg.images_in = + (half **)fpga_malloc(n * sizeof(int *)); // NOLINT + arg->concat_arg.scales_in = + (float **)fpga_malloc(n * sizeof(float *)); // NOLINT + arg->concat_arg.channel_num = + (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT + + for (int i = 0; i < n; i++) { + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].sb_address = bs_ptr; + arg->conv_arg[i].filter_address = (int8_t *)filter_ptr; // NOLINT + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_num = arg->filter_num; + arg->conv_arg[i].group_num = (uint32_t)group_num; + + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + 
arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + + arg->conv_arg[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + + int num_after_alignment = + filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + arg->conv_arg[i].free_space = + fpga_malloc(num_after_alignment * 2 * sizeof(half)); + } +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h new file mode 100644 index 0000000000000000000000000000000000000000..1f4a203936b517d93e2d417b08a8b8456cc1fc93 --- /dev/null +++ b/src/fpga/V2/api.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "fpga/V2/driver/pe.h" +#include "fpga/V2/fpga_common.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace fpga { + +int open_device(); +int close_device(); +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + +float filter_find_max(framework::Tensor* filter_tensor); +int get_aligned_channel_num(int channel_num); +int get_aligned_filter_num(framework::Tensor* filter_tensor); +int get_conv_output_channel(framework::Tensor* filter_tensor); + +void format_image(framework::Tensor* image_tensor); +void format_fp16_ofm(framework::Tensor* ofm_tensor, + int aligned_channel); // only allocate memory +void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel); + +void format_filter(framework::Tensor* filter_tensor, float max_value, + int group_num); +void format_fc_filter(framework::Tensor* filter_tensor, float max_value); +void format_bias_scale_array(float** bias_scale_array, int filter_num, + int filter_channel); +void format_concat_output(framework::Tensor* out, int height, int width, + uint32_t out_channel); +int format_conv_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float* bs_ptr, int group); +int format_fc_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float* bs_ptr); +void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3afd3f51bbb10e3bb2d66195fcc54d25c56e2393 --- /dev/null +++ b/src/fpga/V2/bias_scale.cpp @@ -0,0 +1,47 
@@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/bias_scale.h" +#include +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { +namespace bias_scale { + +void align_element(float **data_in, int num, int num_after_alignment) { + float *ptr_unaligned = *data_in; + int total_element = 2 * num_after_alignment; // including bias & scale + float *ptr_aligned = + (float *)fpga_malloc(total_element * sizeof(float)); // NOLINT + memset(ptr_aligned, 0, total_element * sizeof(float)); + + for (int i = 0; i < num; i++) { + ptr_aligned[i * 2 + 0] = ptr_unaligned[i]; + ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num]; + } + + fpga_free(ptr_unaligned); + *data_in = ptr_aligned; +} + +void format_bias_scale_array(float **data_in, int num, + int num_after_alignment) { + align_element(data_in, num, num_after_alignment); + fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float)); +} + +} // namespace bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/bias_scale.h b/src/fpga/V2/bias_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..6040c0bef138631e2d1ada280d7a1fc593915e36 --- /dev/null +++ b/src/fpga/V2/bias_scale.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle_mobile { +namespace fpga { +namespace bias_scale { + +void align_element(float **data_in, int num, int num_after_alignment); +void format_bias_scale_array(float **data_in, int num, int num_after_alignment); + +} // namespace bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/config.h b/src/fpga/V2/config.h new file mode 100644 index 0000000000000000000000000000000000000000..27187c7b854c84d501949db41fe89f9dca1d2bf1 --- /dev/null +++ b/src/fpga/V2/config.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define PADDLE_MOBILE_ZU5 +#define FPGA_PRINT_MODE diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/V2/driver/bitmap.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c612faa6aed11b683ff81fffdf6c57a6fed9536d --- /dev/null +++ b/src/fpga/V2/driver/bitmap.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/driver/bitmap.h" + +namespace fpga_bitmap { +void bitmap_set(uint64_t *map, unsigned int start, int len) { + uint64_t *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +void bitmap_clear(uint64_t *map, unsigned int start, int len) { + uint64_t *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} + +static uint64_t ffs(uint64_t data) { + uint64_t bit = 0; + int i = 0; + + for (i = 0; i < sizeof(data) * 8; i++) { + if (data & (1UL << i)) { + bit = i; + break; + } + } + + return bit; +} + +static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits, + uint64_t start, uint64_t invert) { + uint64_t tmp = 0; + + if (!nbits || start >= nbits) return nbits; + + tmp = addr[start / 
BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits; +} + +uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size, + uint64_t offset) { + return _find_next_bit(addr, size, offset, ~0UL); +} + +uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) { + return _find_next_bit(addr, size, offset, 0UL); +} + +uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask, + uint64_t align_offset) { + uint64_t index = 0; + uint64_t end = 0; + uint64_t i = 0; + +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; + + end = index + nr; + if (end > size) return end; + i = find_next_bit(map, end, index); + if (i < end) { + start = i + 1; + goto again; + } + + return index; +} + +uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask) { + return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); +} +} // namespace fpga_bitmap diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/V2/driver/bitmap.h new file mode 100644 index 0000000000000000000000000000000000000000..4cb1673d91d61c1ec27bbc6923e49e8dd04e3a37 --- /dev/null +++ b/src/fpga/V2/driver/bitmap.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#define BITS_PER_LONG 64 +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) + +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) + +#define round_down(x, y) ((x) & ~((y)-1)) + +namespace fpga_bitmap { +void bitmap_set(uint64_t *map, unsigned int start, int len); +void bitmap_clear(uint64_t *map, unsigned int start, int len); +uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask); + +} // namespace fpga_bitmap diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/V2/driver/driver.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7e71782676fd350f938847c03e9736ff0adb64a --- /dev/null +++ b/src/fpga/V2/driver/driver.cpp @@ -0,0 +1,432 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/enforce.h" +#include "fpga/V2/driver/bitmap.h" +#include "fpga/V2/driver/driver.h" + +namespace paddle_mobile { +namespace fpga { +namespace driver { +struct FPGA_INFO g_fpgainfo; + +int open_drvdevice() { + if (g_fpgainfo.fd_drv == -1) { + g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR); + } + return g_fpgainfo.fd_drv; +} + +int open_memdevice() { + if (g_fpgainfo.fd_mem == -1) { + // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC); + g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR); + } + return g_fpgainfo.fd_mem; +} + +void pl_reset() { + // DLOG << "PL RESET"; + + usleep(100 * 1000); +} + +void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe, + char const *type_name, int pe_idx) { + memset(pe, 0, sizeof(struct fpga_pe)); + + pe->outer = pe_data; + snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name); + + pe->status = IDLE; + pe->interrupt_cnt = 0; + pe_data->pes[pe_idx] = pe; + pe_data->pe_num++; +} + +void pl_init() { + struct pe_data_s *pe_data = nullptr; + + pl_reset(); + + pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s)); + if (pe_data == nullptr) { + DLOG << "pe_data malloc error!"; + return; + } + memset(pe_data, 0, sizeof(struct pe_data_s)); + pthread_mutex_init(&pe_data->mutex, 0); + + setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV); + setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING); + setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW); + setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS); + + g_fpgainfo.pe_data = pe_data; +} + +void pl_destroy() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + pthread_mutex_destroy(&pe_data->mutex); + free(pe_data); +} + +void pl_start() 
{ + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + + pthread_mutex_unlock(&pe_data->mutex); +} + +void pl_stop() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + + pthread_mutex_lock(&pe_data->mutex); +} + +void pl_reinit() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + struct fpga_pe *pe = nullptr; + int i = 0; + + pl_stop(); + pl_reset(); + pl_start(); + + for (i = 0; i < pe_data->pe_num; i++) { + pe = pe_data->pes[i]; + pe->status = IDLE; + pe->interrupt_cnt = 0; + } + + pl_start(); +} + +int pl_get_status() { return 0; } + +/*tmie单位us*/ +int fpga_regpoll(uint64_t reg, uint64_t val, int time) { + uint64_t i = 0; + /*timeout精确性待确认*/ + int64_t timeout = time * 6; + + for (i = 0; i < timeout; i++) { + if (val == reg_readq(reg)) { + break; + } + } + + if (i <= timeout) { + return 0; + } else { + return -1; + } +} + +/*内存管理*/ +int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { + uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); + unsigned int nr = (unsigned int)_nr; + int ret = 0; + + pthread_mutex_lock(&memory->mutex); + + unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area( + memory->bitmap, memory->page_num, 0, nr, 0); + if (pos <= memory->page_num) { + uint64_t address_ofset = + memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE; + fpga_bitmap::bitmap_set(memory->bitmap, pos, nr); + memory->nr[pos] = nr; + + *addr = address_ofset; + } else { + ret = -ENOMEM; + } + + pthread_mutex_unlock(&memory->mutex); + + return ret; +} + +void memory_release(struct fpga_memory *memory) { + void *ptr = nullptr; + + /*unmap memory*/ + std::map map = g_fpgainfo.fpga_addr2size_map; + std::map::iterator iter; + for (iter = map.begin(); iter != map.end(); iter++) { + fpga_free_driver(ptr); + } +} + +int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) { + int rc = 0; + + uint64_t *bitmap = nullptr; + unsigned int *nr = nullptr; + + // 不允许多份memory创建,所以创建memory结构体不存在互斥 + // 
pthread_mutex_lock(&memory->mutex); + memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE); + memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG); + + bitmap = + (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT + if (!bitmap) { + rc = -EFAULT; + return rc; + } + memory->bitmap = bitmap; + + nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int)); + if (!nr) { + rc = -EFAULT; + free(bitmap); + return rc; + } + memory->nr = nr; + + memory->mem_start = FPGA_MEM_PHY_ADDR; + memory->mem_end = FPGA_MEM_SIZE; + // pthread_mutex_unlock(memory->mutex); + + return rc; +} + +int create_fpga_memory(struct fpga_memory **memory_info) { + int rc = 0; + + *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory)); + if (*memory_info == NULL) { + rc = -EFAULT; + return rc; + } + pthread_mutex_init(&((*memory_info)->mutex), nullptr); + + rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE); + if (rc) { + free(*memory_info); + } + + return rc; +} + +int init_fpga_memory(struct fpga_memory *memory) { + int rc = 0; + + if (!memory) { + rc = -EFAULT; + return rc; + } + + fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num); + fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0. 
+ + return 0; +} + +void destroy_fpga_memory(struct fpga_memory *memory) { + if (memory) { + free(memory->nr); + free(memory->bitmap); + free(memory); + } +} + +int fpga_memory_add() { + int rc = 0; + + rc = create_fpga_memory(&g_fpgainfo.memory_info); + if (rc) { + return rc; + } + + rc = init_fpga_memory(g_fpgainfo.memory_info); + if (rc) { + destroy_fpga_memory(g_fpgainfo.memory_info); + return rc; + } + + return 0; +} + +uint64_t vaddr_to_paddr(void *address) { + uint64_t paddr = 0; + auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); + if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { + paddr = iter->second; + } else { + DLOG << "Invalid pointer"; + } + + return paddr; +} + +void *fpga_reg_malloc(size_t size) { + void *ret = nullptr; + ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR); + // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + + g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); + + return ret; +} + +void *fpga_reg_free(void *ptr) { + size_t size = 0; + + auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); + if (iter != g_fpgainfo.fpga_addr2size_map.end()) { + size = iter->second; + g_fpgainfo.fpga_addr2size_map.erase(iter); + munmap(ptr, size); + } else { + DLOG << "Invalid pointer"; + } +} + +void *fpga_malloc_driver(size_t size) { + void *ret = nullptr; + uint64_t phy_addr = 0; + int i = 0; + + memory_request(g_fpgainfo.memory_info, size, &phy_addr); + + ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_fpgainfo.fd_mem, phy_addr); + PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + + g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr)); + g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); + + return ret; +} + +void fpga_free_driver(void *ptr) { + size_t size = 0; + uint32_t pos = 0; + uint64_t p_addr = 0; + + auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); + if (iter != 
g_fpgainfo.fpga_addr2size_map.end()) { + size = iter->second; + g_fpgainfo.fpga_addr2size_map.erase(iter); + munmap(ptr, size); + + p_addr = vaddr_to_paddr(ptr); + pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; + + /*clear bitmap*/ + pthread_mutex_lock(&g_fpgainfo.memory_info->mutex); + fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, + g_fpgainfo.memory_info->nr[pos]); + pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); + } else { + DLOG << "Invalid pointer"; + } +} + +static inline int do_ioctl(unsigned long req, const void *arg) { + return ioctl(g_fpgainfo.fd_mem, req, arg); +} + +int fpga_flush_driver(void *address, size_t size) { + struct MemoryCacheArgs args; + uint64_t p_addr; + + p_addr = vaddr_to_paddr(address); + + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.size = size; + + return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); +} + +int fpga_invalidate_driver(void *address, size_t size) { + struct MemoryCacheArgs args; + uint64_t p_addr; + + p_addr = vaddr_to_paddr(address); + + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.size = size; + + return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); +} + +void fpga_copy_driver(void *dest, const void *src, size_t num) { + uint64_t i; + + DLOG << "dest:" << dest << " src:" << src << " size:" << num; + + for (i = 0; i < num; i++) { + // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); + // usleep(1); + *((int8_t *)dest + i) = *((int8_t *)src + i); + } + + return; +} + +int open_device_driver() { + g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR; + g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR; + g_fpgainfo.FpgaRegVirAddr = nullptr; + g_fpgainfo.pe_data = nullptr; + g_fpgainfo.drvdevice_path = "/dev/fpgadrv0"; + g_fpgainfo.memdevice_path = "/dev/fpgamem0"; + g_fpgainfo.fd_drv = -1; + g_fpgainfo.fd_mem = -1; + + int ret = 0; + ret = open_drvdevice(); + ret |= open_memdevice(); + + g_fpgainfo.FpgaRegVirAddr = + (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // 
NOLINT + fpga_memory_add(); + + pl_init(); + + return ret; +} + +int close_device_driver() { + pl_destroy(); + fpga_reg_free(g_fpgainfo.FpgaRegVirAddr); + memory_release(g_fpgainfo.memory_info); + destroy_fpga_memory(g_fpgainfo.memory_info); + + return 0; +} + +} // namespace driver +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/V2/driver/driver.h new file mode 100644 index 0000000000000000000000000000000000000000..633e95ea8204ada2a330a6bb4fab4ce8fe23248b --- /dev/null +++ b/src/fpga/V2/driver/driver.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/log.h" + +namespace paddle_mobile { +namespace fpga { +namespace driver { + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) + +#define FPGA_REG_PHY_ADDR 0xa0000000 +#define FPGA_REG_SIZE 0x1000 +#define FPGA_MEM_PHY_ADDR 0x20000000 +#define FPGA_MEM_SIZE 0x20000000 + +#define CPU_FREQ 1000000000 + +#define FPGA_PAGE_SIZE (16UL * 1024UL) + +// PE related macros +const int MAX_NUM_PES = 6; +const size_t MAX_TYPE_NAME_LENTH = 8; + +const int PE_IDX_CONV = 0; +const int PE_IDX_POOLING = 1; +const int PE_IDX_EW = 2; +const int PE_IDX_BYPASS = 3; + +enum pe_status { IDLE = 0, BUSY = 1 }; + +struct MemoryCacheArgs { + void *offset; + size_t size; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) + +struct fpga_pe { + char type_name[MAX_TYPE_NAME_LENTH + 1]; + struct pe_data_s *outer; + pe_status status; // 0=idle 1=busy -1=fail + uint64_t interrupt_cnt; +}; + +struct pe_data_s { + pthread_mutex_t mutex; + struct fpga_pe pe_conv; + struct fpga_pe pe_pooling; + struct fpga_pe pe_ew; + struct fpga_pe pe_bypass; + + struct fpga_pe *pes[MAX_NUM_PES]; + int pe_num; +}; + +struct fpga_memory { + pthread_mutex_t mutex; + uint64_t *bitmap; + unsigned int *nr; + unsigned int page_num; + unsigned int page_num_long; + uint64_t mem_start; + uint64_t mem_end; +}; + +struct FPGA_INFO { + uint64_t FpgaRegPhyAddr; + uint64_t FpgaMemPhyAddr; + pthread_t poll_pid; + void *FpgaRegVirAddr; + struct pe_data_s *pe_data; + + std::map fpga_addr2size_map; + std::map fpga_vaddr2paddr_map; + const char *drvdevice_path; + const char *memdevice_path; + struct fpga_memory *memory_info; + int fd_drv; + int fd_mem; +}; + +extern struct FPGA_INFO g_fpgainfo; + +inline uint64_t reg_readq(uint32_t offset) { + // DLOG << "offset : " << offset; + uint64_t value = 
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset); // NOLINT + + return value; +} + +inline void reg_writeq(uint64_t value, uint32_t offset) { + // DLOG << "offset : " << offset << ", value : " << value; + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset) = // NOLINT + value; +} + +int open_device_driver(); + +int close_device_driver(); + +void *fpga_malloc_driver(size_t size); + +void fpga_free_driver(void *ptr); + +void fpga_copy_driver(void *dest, const void *src, size_t num); + +int fpga_flush_driver(void *address, size_t size); + +int fpga_invalidate_driver(void *address, size_t size); + +/*pe*/ + +uint64_t vaddr_to_paddr(void *address); + +int fpga_regpoll(uint64_t reg, uint64_t val, int time); + +} // namespace driver +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e806bfb37c131fad1c011c960bc79aa1b121186 --- /dev/null +++ b/src/fpga/V2/driver/pe.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fpga/V2/driver/pe.h" +#include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" +#include "fpga/V2/filter.h" +#include "fpga/V2/image.h" + +namespace paddle_mobile { +namespace fpga { +#define MUL8(x) ((x)*8) +#define BYPASS_DONE 1 + +float Findfp16Max() { + uint16_t abs_vals[16]; + uint64_t max_fp16; + + max_fp16 = driver::reg_readq(MUL8(49)); + abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = driver::reg_readq(MUL8(50)); + abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = driver::reg_readq(MUL8(51)); + abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = driver::reg_readq(MUL8(52)); + abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16)); + abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + + uint16_t tmp = 0; + for (int i = 0; i < 16; i++) { + if (tmp < abs_vals[i]) { + tmp = abs_vals[i]; + } + } + return fp16_2_fp32(tmp) / 127.0f; +} + +int ComputeFpgaConv(const struct SplitConvArgs &args) { + ComputeBasicConv(args.conv_arg[0]); +} + +int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " 
sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " 
const1:" << fp16_2_fp32(int16_t(args.const1)); + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" << args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); + uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); + uint64_t bp_enable; + int64_t length; + uint64_t pixels; + + // fp32->fp16 + if ((args.input_data_type) && 
(!args.output_data_type)) { + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(float); + bp_enable = 0x8800000000000000 + length; + } + // fp16->fp32 + else if ((!args.input_data_type) && (args.output_data_type)) { + pixels = filter::calc_aligned_channel((args.image.channels)) * + (args.image.width) * (args.image.height); + length = pixels * sizeof(short); + length = align_to_x((int)length, 64); // NOLINT + bp_enable = 0x8a00000000000000 + length; + } + // fp16->fp16 findmax + else if ((!args.input_data_type) && (!args.output_data_type)) { + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(short); + bp_enable = 0x8900000000000000 + length; + } else { + return -1; + } + + // start bypass + driver::reg_writeq(ifm_src_paddr, MUL8(27)); + driver::reg_writeq(ifm_dst_paddr, MUL8(28)); + driver::reg_writeq(0, MUL8(0)); + driver::reg_writeq(bp_enable, MUL8(0)); + // poll + int ret = -1; + ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); + if (ret != -1) { + // clear "irq" + driver::reg_readq(MUL8(63)); + } + // get max value + if ((!args.input_data_type) && (!args.output_data_type)) { + float scale = Findfp16Max(); + args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT + args.output.scale_address[1] = scale; + } + return ret; +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out + << " out_channel:" << args.out_channel; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " aligned_channel_num:" << args.aligned_channel_num[i] + << " image_address:" << args.images_in[i] + << " 
image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width, args.aligned_channel_num, + args.out_channel); + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/V2/driver/pe.h new file mode 100644 index 0000000000000000000000000000000000000000..4903bf4c33f6b5d5899c56eeaada8c7a21d1a875 --- /dev/null +++ b/src/fpga/V2/driver/pe.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "fpga/V2/fpga_common.h" + +namespace paddle_mobile { +namespace fpga { + +int PerformBypass(const struct BypassArgs& args); +int ComputeBasicConv(const struct ConvArgs& args); +int ComputeFpgaPool(const struct PoolingArgs& args); +int ComputeFpgaEWAdd(const struct EWAddArgs& args); + +int ComputeFpgaConv(const struct SplitConvArgs& args); +int ComputeFPGAConcat(const struct ConcatArgs& args); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce278edbeed64f2ca413c1f75ff620ee1f44c83d --- /dev/null +++ b/src/fpga/V2/filter.cpp @@ -0,0 +1,156 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/filter.h" +#include +#include +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { +namespace filter { + +int calc_channel_parallelism(int channel) { + if (channel <= 16) { + return 16; + } else if (channel <= 32) { + return 32; + } else if (channel <= 64) { + return 64; + } else { + return 128; + } +} +int calc_aligned_channel(int channel) { + return align_to_x(channel, calc_channel_parallelism(channel)); +} + +int calc_num_parallelism(int channel) { + return FILTER_PARALLELISM / calc_channel_parallelism(channel); +} + +int calc_aligned_num(int num, int channel) { + return align_to_x(num, calc_num_parallelism(channel)); +} + +int calc_aligned_total_pixel_num(int num, int channel, int height, int width) { + int aligned_channel = calc_aligned_channel(channel); + int aligned_filter_num = calc_aligned_num(num, channel); + return aligned_filter_num * aligned_channel * height * width; +} + +void convert_to_hwc(float **data_in, int num, int channel, int height, + int width) { + float *tmp = *data_in; + int chw = channel * height * width; + float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT + for (int n = 0; n < num; n++) { + int64_t amount_per_row = width * channel; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * chw + offset_height + w * channel + c) = 
+ *((*data_in)++); + } + } + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void align_filter(float **data_in, int num, int channel, int height, + int width) { + int aligned_channel = calc_channel_parallelism(channel); + int hw = height * width; + int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT + float *temp = *data_in; + memset(new_data, 0, pixel_num * sizeof(float)); + for (int i = 0; i < num; i++) { + for (int j = 0; j < hw; j++) { + memcpy(new_data + i * aligned_channel * hw + j * aligned_channel, + temp + i * channel * hw + j * channel, channel * sizeof(float)); + } + } + *data_in = new_data; + fpga_free(temp); +} + +void format_filter(float **data_in, int num, int channel, int height, int width, + int group_num, float max) { + convert_to_hwc(data_in, num, channel, height, width); + align_filter(data_in, num, channel, height, width); + int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + fpga_flush(*data_in, pixel_num * sizeof(float)); +} + +void convert_fc_filter(float **data_in, int num, int chw) { + float *tmp = *data_in; + float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT + for (int n = 0; n < num; n++) { + for (int c = 0; c < chw; c++) { + data_tmp[n * chw + c] = (*data_in)[num * c + n]; + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void format_fc_filter(float **data_in, int num, int channel, int height, + int width, int group_num, float max) { + int chw = channel * height * width; + convert_fc_filter(data_in, num, chw); + align_filter(data_in, num, channel, height, width); + int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + fpga_flush(*data_in, pixel_num * sizeof(float)); +} + +float find_max(float *data_in, int data_size) { + float max = 0.0; + for (int i = 0; i < data_size; ++i) { + float value = data_in[i]; + float abs = value > 0 ? 
value : -value; + max = std::max(max, abs); + } + return max; +} + +signed char float_to_int8(float fdata) { + if (fdata < 0.0) { + fdata -= 0.5; + } else { + fdata += 0.5; + } + return (signed char)fdata; +} + +void quantize(float **data_in, int data_size, float max) { + float *tmp = *data_in; + float fix_range = 127; + float scale = fix_range / max; + + signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = float_to_int8( + (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); + } + *data_in = (float *)tmp_data; // NOLINT + fpga_free(tmp); +} + +} // namespace filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/filter.h b/src/fpga/V2/filter.h new file mode 100644 index 0000000000000000000000000000000000000000..08c758bca4a65d232f6dd69eef9c752558b01da0 --- /dev/null +++ b/src/fpga/V2/filter.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#define FILTER_PARALLELISM 1024 +namespace paddle_mobile { +namespace fpga { +namespace filter { + +int calc_channel_parallelism(int channel); +int calc_aligned_channel(int channel); +int calc_num_parallelism(int channel); +int calc_aligned_num(int num, int channel); +int calc_aligned_total_pixel_num(int num, int channel, int height, int width); +void convert_to_hwc(float** data_in, int num, int channel, int height, + int width); +void format_filter(float** data_in, int num, int channel, int height, int width, + int group_num, float max); +void convert_fc_filter(float** data_in, int num, int chw); +void format_fc_filter(float** data_in, int num, int channel, int height, + int width, int group_num, float max); +float find_max(float* data_in, int data_size); +} // namespace filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..01bca30a9ccf79232e1f28bbf77b1c030632f5bc --- /dev/null +++ b/src/fpga/V2/fpga_common.cpp @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +namespace paddle_mobile { +namespace fpga { + +int16_t fp32_2_fp16(float fp32_num) { + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT + auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | + (((tmp & 0x7f800000) >> 13) - (112 << 10))); + if (tmp & 0x1000) { + t++; // roundoff + } + return t; +} + +float fp16_2_fp32(int16_t fp16_num) { + if (0 == fp16_num) { + return 0; + } + int frac = (fp16_num & 0x3ff); + int exp = ((fp16_num & 0x7c00) >> 10) + 112; + int s = fp16_num & 0x8000; + int tmp = 0; + float fp32_num; + tmp = s << 16 | exp << 23 | frac << 13; + fp32_num = *(float *)&tmp; // NOLINT + return fp32_num; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/V2/fpga_common.h new file mode 100644 index 0000000000000000000000000000000000000000..1862d843503ee8faf58caf038202e198ca079905 --- /dev/null +++ b/src/fpga/V2/fpga_common.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle_mobile { +namespace fpga { + +enum DataType { + DATA_TYPE_FP32 = 1, + DATA_TYPE_FP16 = 0, +}; + +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, +}; + +struct KernelArgs { + uint32_t width; + uint32_t height; + uint32_t stride_w; + uint32_t stride_h; +}; + +struct ImageInputArgs { + void* address; // input featuremap virtual address + float* scale_address; // input scale address; + uint32_t channels; + uint32_t width; // featuremap width + uint32_t height; + uint32_t pad_width; // padding width; + uint32_t pad_height; +}; + +struct ImageOutputArgs { + void* address; // output result address; + float* scale_address; // output scale address; + uint64_t timer_cnt; // time counter for FPGA computation +}; + +struct ConvArgs { + bool relu_enabled; + void* sb_address; // scale and bias + void* filter_address; + float* filter_scale_address; + void* free_space; // used by FPGA logic + uint32_t filter_num; + uint32_t group_num; + + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct ConcatArgs { + uint32_t image_num; + int16_t** images_in; + float** scales_in; + void* image_out; + float* scale_out; + uint32_t* channel_num; + uint32_t* aligned_channel_num; + uint32_t out_channel; + uint32_t height; + uint32_t width; +}; + +struct SplitConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_arg; + struct ConcatArgs concat_arg; +}; + +struct PoolingArgs { + int16_t mode; // mode: 0:max, 1:avg + int16_t kernel_reciprocal; + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct EWAddArgs { + bool relu_enabled; + uint32_t const0; // output0 = const0 x input0 + const1 x input1; + uint32_t const1; + struct ImageInputArgs image0; + struct ImageInputArgs image1; + struct ImageOutputArgs output; +}; + +struct BypassArgs 
{ + enum DataType input_data_type; + enum DataType output_data_type; + enum LayoutType input_layout_type; + enum LayoutType output_layout_type; + struct ImageInputArgs image; + struct ImageOutputArgs output; +}; + +struct DeconvArgs { + struct ConvArgs conv_arg; +}; +static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +int16_t fp32_2_fp16(float fp32_num); +float fp16_2_fp32(int16_t fp16_num); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26829bfba65f2375b27251070b33b2bbe57d069b --- /dev/null +++ b/src/fpga/V2/image.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fpga/V2/image.h" +#include +#include +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { +namespace image { + +void convert_to_hwc(float **data_in, int channel, int height, int width) { + float *tmp = *data_in; + float *data_tmp = + (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT + int64_t amount_per_row = width * channel; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); + } + } + } + *data_in = data_tmp; + fpga_free(tmp); +} +void align_image(float **data_in, int channel, int height, int width, + int aligned_channel) { + if (channel == aligned_channel) return; + float *tmp = *data_in; + float *new_data = + (float *)fpga_malloc(aligned_channel * height * width * // NOLINT + sizeof(float)); // NOLINT + memset(new_data, 0, aligned_channel * height * width * sizeof(float)); + + for (int i = 0; i < height * width; i++) { + memcpy(new_data + i * aligned_channel, tmp + i * channel, + channel * sizeof(float)); + } + *data_in = new_data; + fpga_free(tmp); +} + +void format_image(float **data_in, int channel, int height, int width, + int aligned_channel) { + convert_to_hwc(data_in, channel, height, width); + align_image(data_in, channel, height, width, aligned_channel); + fpga_flush(*data_in, aligned_channel * height * width * sizeof(float)); +} + +void concat_images(int16_t **images_in, float **scales_in, void *image_out, + float *scale_out, int image_num, const uint32_t *channel_num, + int height, int width, const uint32_t *aligned_channel_num, + int out_channel) { + int hw = height * width; + scale_out[0] = 0.0; + scale_out[1] = 0.0; + for (int i = 0; i < image_num; i++) { + scale_out[0] = std::max(*scale_out, scales_in[i][0]); + fpga_invalidate(images_in[i], + height * width * aligned_channel_num[i] * sizeof(int16_t)); + } + 
scale_out[1] = 1 / scale_out[0]; + + for (int j = 0; j < hw; j++) { + int tmp_channel_sum = 0; + for (int i = 0; i < image_num; i++) { + memcpy( + (int16_t *)image_out + j * out_channel + tmp_channel_sum, // NOLINT + images_in[i] + j * aligned_channel_num[i], + channel_num[i] * sizeof(int16_t)); + + tmp_channel_sum += channel_num[i]; + } + } + fpga_flush(image_out, hw * out_channel * sizeof(int16_t)); +} + +} // namespace image +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/image.h b/src/fpga/V2/image.h new file mode 100644 index 0000000000000000000000000000000000000000..df20e583fc64e3544eb24bee7aeaf3652331180c --- /dev/null +++ b/src/fpga/V2/image.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle_mobile { +namespace fpga { +namespace image { + +void convert_to_hwc(float **data_in, int channel, int height, int width); +void align_image(float **data_in, int channel, int height, int width, + int aligned_channel); +void format_image(float **data_in, int channel, int height, int width, + int aligned_channel); +void concat_images( + int16_t **images_in, float **scales_in, void *image_out, float *scale_out, + int image_num, const uint32_t *channel_num, int height, int width, + const uint32_t *aligned_channel_num, + int out_channel); // Concat featuremaps along channel direction + +} // namespace image +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/framework/attribute.h b/src/framework/attribute.h index a94346bc7ab321b0f5710a98fb3cc60198f148b0..a21e0a4ec321dbfe08f87160cc2f0c159594920d 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -117,9 +117,9 @@ class Attribute { template static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) { - if (attr.variant_.TypeId() == typeid(int).hash_code()) { + if (attr.variant_.TypeId() == typeid(int).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { + } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(string).hash_code()) { return vistor(attr.variant_.GetString()); @@ -129,7 +129,7 @@ class Attribute { return vistor(attr.variant_.Get>()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { + } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); @@ -137,7 +137,6 @@ 
class Attribute { return vistor(attr.variant_.Get()); } else { PADDLE_MOBILE_THROW_EXCEPTION("type not support"); - exit(0); } } diff --git a/src/framework/cl/cl_deleter.h b/src/framework/cl/cl_deleter.h new file mode 100644 index 0000000000000000000000000000000000000000..55af631174ae9f2a7815c2da35ebadda3ebfd9e9 --- /dev/null +++ b/src/framework/cl/cl_deleter.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CL/cl.h" + +struct CLKernelDeleter { + template + void operator()(T *clKernelObj) { + clReleaseKernel(clKernelObj); + } +}; + +struct CLMemDeleter { + template + void operator()(T *clMemObj) { + clReleaseMemObject(clMemObj); + } +}; + +struct CLEventDeleter { + template + void operator()(T *clEventObj) { + clReleaseEvent(clEventObj); + } +}; + +struct CLCommQueueDeleter { + template + void operator()(T *clQueueObj) { + clReleaseCommandQueue(clQueueObj); + } +}; + +struct CLContextDeleter { + template + void operator()(T *clContextObj) { + clReleaseContext(clContextObj); + } +}; + +struct CLProgramDeleter { + template + void operator()(T *clProgramObj) { + clReleaseProgram(clProgramObj); + } +}; diff --git a/src/framework/cl/cl_engine.cpp b/src/framework/cl/cl_engine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04d1675227aac0967f8dee94aa7a27ae5ea73c0f --- /dev/null +++ b/src/framework/cl/cl_engine.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_engine.h" +#include "CL/cl.h" +#include "framework/cl/cl_tool.h" + +#include +#include + +namespace paddle_mobile { +namespace framework { + +bool CLEngine::Init() { + if (initialized_) { + return true; + } + cl_int status; + SetPlatform(); + SetClDeviceId(); + + initialized_ = true; + return initialized_; + // setClCommandQueue(); + // std::string filename = "./HelloWorld_Kernel.cl"; + // loadKernelFromFile(filename.c_str()); + // buildProgram(); +} + +CLEngine *CLEngine::Instance() { + static CLEngine cl_engine_; + cl_engine_.Init(); + return &cl_engine_; +} + +bool CLEngine::SetPlatform() { + platform_ = NULL; // the chosen platform + cl_uint numPlatforms; // the NO. of platforms + cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); + + /**For clarity, choose the first available platform. 
*/ + if (numPlatforms > 0) { + cl_platform_id *platforms = reinterpret_cast( + malloc(numPlatforms * sizeof(cl_platform_id))); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + platform_ = platforms[0]; + free(platforms); + return true; + } else { + return false; + } +} + +bool CLEngine::SetClDeviceId() { + cl_uint numDevices = 0; + devices_ = NULL; + cl_int status = + clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + + if (numDevices > 0) { + devices_ = reinterpret_cast( + malloc(numDevices * sizeof(cl_device_id))); + status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, + NULL); + return true; + } + return false; +} + +// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel( +// const std::string &kernel_name) { +// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel( +// clCreateKernel(program_.get(), kernel_name.c_str(), NULL)); +// return std::move(kernel); +//} +// +// bool CLEngine::SetClCommandQueue() { +// cl_int status; +// command_queue_.reset( +// clCreateCommandQueue(context_.get(), devices_[0], 0, &status)); +// return true; +//} + +// bool CLEngine::SetClContext() { +// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL)); +// return true; +//} + +// bool CLEngine::LoadKernelFromFile(const char *kernel_file) { +// size_t size; +// char *str; +// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary)); +// +// if (!f.is_open()) { +// return false; +// } +// +// size_t fileSize; +// f.seekg(0, std::fstream::end); +// size = fileSize = (size_t)f.tellg(); +// f.seekg(0, std::fstream::beg); +// str = new char[size + 1]; +// if (!str) { +// f.close(); +// return 0; +// } +// +// f.read(str, fileSize); +// f.close(); +// str[size] = '\0'; +// const char *source = str; +// size_t sourceSize[] = {strlen(source)}; +// program_.reset( +// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize, +// NULL)); +// return true; +//} + +} // namespace 
framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..d7b1c912dac304660f39e0e294122d0d27eb9bb6 --- /dev/null +++ b/src/framework/cl/cl_engine.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "CL/cl.h" +#include "common/enforce.h" +#include "common/log.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +class CLEngine { + public: + static CLEngine *Instance(); + + bool Init(); + + std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() { + cl_int status; + cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status); + std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c); + CL_CHECK_ERRORS(status); + return std::move(context_ptr); + } + + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( + cl_context context) { + cl_int status; + cl_command_queue queue = + clCreateCommandQueue(context, devices_[0], 0, &status); + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr( + queue); + CL_CHECK_ERRORS(status); + return std::move(command_queue_ptr); + } + + std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith( + cl_context context, std::string file_name) { + FILE *file = 
fopen(file_name.c_str(), "rb"); + PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", + file_name.c_str()); + fseek(file, 0, SEEK_END); + int64_t size = ftell(file); + PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); + rewind(file); + char *data = new char[size + 1]; + size_t bytes_read = fread(data, 1, size, file); + data[size] = '\0'; + PADDLE_MOBILE_ENFORCE(bytes_read == size, + "read binary file bytes do not match with fseek"); + fclose(file); + + const char *source = data; + size_t sourceSize[] = {strlen(source)}; + cl_program p = + clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); + + DLOG << " cl kernel file name: " << file_name; + DLOG << " source size: " << sourceSize[0]; + CL_CHECK_ERRORS(status_); + + std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); + + return std::move(program_ptr); + } + + std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) { + cl_event event = clCreateUserEvent(context, &status_); + std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event); + CL_CHECK_ERRORS(status_); + return std::move(event_ptr); + } + + bool BuildProgram(cl_program program) { + cl_int status; + std::string path = "-cl-fast-relaxed-math -I " + + CLEngine::Instance()->GetCLPath() + "/cl_kernel"; + + status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0); + + CL_CHECK_ERRORS(status); + + if (status_ == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = reinterpret_cast(malloc(log_size)); + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + DLOG << " program build error: " << log; + } + + if (status == CL_SUCCESS) { + return true; + } else { + return false; + } + } + + cl_device_id DeviceID(int index = 0) { return devices_[index]; } + + std::string GetCLPath() { return cl_path_; } + void setClPath(std::string 
cl_path) { cl_path_ = cl_path; } + + private: + CLEngine() { initialized_ = false; } + + bool SetPlatform(); + + bool SetClDeviceId(); + + bool initialized_; + + cl_platform_id platform_; + + cl_device_id *devices_; + + cl_int status_; + + std::string cl_path_; + std::unique_ptr<_cl_program, CLProgramDeleter> program_; + + // bool SetClContext(); + + // bool SetClCommandQueue(); + + // bool LoadKernelFromFile(const char *kernel_file); + + // bool BuildProgram(); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.cpp b/src/framework/cl/cl_half.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2877289325d983d0c7d9756732254e0a4ed831b6 --- /dev/null +++ b/src/framework/cl/cl_half.cpp @@ -0,0 +1,518 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +#include "framework/cl/cl_half.h" + +namespace paddle_mobile { +namespace framework { + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 
0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 
0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 
0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 
0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 
0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 
0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, + 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 
0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 
0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 
0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 
0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 
0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 
0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 
+ 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 
0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 
0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +half_t Float2Half(float f) { + uint32_t v = *reinterpret_cast(&f); + return basetable[(v >> 23) & 0x1ff] + + ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); +} + +float Half2Float(half_t h) { + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + + exponenttable[h >> 10]; + return *reinterpret_cast(&v); +} + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { + for (int i = 0; i < count; ++i) { + h_array[i] = Float2Half(f_array[i]); + } +} + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { + for (int i = 0; i < count; ++i) { + f_array[i] = Half2Float(h_array[i]); + } +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.h b/src/framework/cl/cl_half.h new file mode 100644 index 0000000000000000000000000000000000000000..9b05740f1e19af66036a1562243102e5ba42ab1b --- /dev/null +++ b/src/framework/cl/cl_half.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include + +namespace paddle_mobile { +namespace framework { + +typedef uint16_t half_t; + +half_t Float2Half(float f); + +float Half2Float(half_t h); + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..bea91ee24ceb5e9011708bd277629a07beb4b8ef --- /dev/null +++ b/src/framework/cl/cl_helper.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "common/log.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_image.h" +#include "framework/cl/cl_scope.h" + +namespace paddle_mobile { +namespace framework { + +class CLHelper { + public: + CLHelper() = default; + + explicit CLHelper(CLScope *scope) : scope_(scope) {} + + void AddKernel(const std::string &kernel_name, const std::string &file_name) { + DLOG << " begin add kernel "; + auto kernel = scope_->GetKernel(kernel_name, file_name); + DLOG << " add kernel ing "; + kernels.emplace_back(std::move(kernel)); + } + + cl_kernel KernelAt(const int index) { + DLOG << " kernel count: " << kernels.size(); + return kernels[index].get(); + } + + cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); } + + cl_context CLContext() { return scope_->Context(); } + + std::vector DefaultWorkSize(const CLImage &image) { + // n c h w + auto image_dim = image.dims(); + if (image_dim.size() == 4) { + auto n = image_dim[0]; + auto h = image_dim[2]; + auto w = image_dim[3]; + auto image_width = image.ImageWidth(); + auto work_size_0 = image_width / w; + auto work_size_1 = w; + auto work_size_2 = n * h; + return {work_size_0, work_size_1, work_size_2}; + } else if (image_dim.size() == 2) { + return {1, image.ImageWidth(), image.ImageHeight()}; + } else if (image_dim.size() == 1) { + return {1, image.ImageWidth(), 1}; + } + PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); + } + + private: + CLScope *scope_; + std::vector> kernels; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f32de0a61461d9a9b28d4a0cf5e13ecc9d564cf5 --- /dev/null +++ b/src/framework/cl/cl_image.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_image.h" + +namespace paddle_mobile { +namespace framework { + +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, + cl_command_queue commandQueue) { + // TODO(yangfei): need imp +} + +void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, + cl_command_queue commandQueue) { + // TODO(yangfei): need imp +} + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &cl_image) { + int width = cl_image.ImageDims()[0]; + int height = cl_image.ImageDims()[1]; + + half_t *image_data = new half_t[height * width * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, + region, 0, 0, image_data, 0, NULL, NULL); + + CL_CHECK_ERRORS(err); + + float *tensor_data = new float[cl_image.numel()]; + auto converter = cl_image.Converter(); + converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), + cl_image.dims()); + int stride = cl_image.numel() / 20; + stride = stride > 0 ? 
stride : 1; + + printer << " dims: " << cl_image.dims() << "\n"; + for (int i = 0; i < cl_image.numel(); i += stride) { + printer << tensor_data[i] << " "; + } + + delete[](tensor_data); + delete[](image_data); + + return printer; +} +#endif +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h new file mode 100644 index 0000000000000000000000000000000000000000..35f60d3b773937d381447b23b64985ce543fddee --- /dev/null +++ b/src/framework/cl/cl_image.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "CL/cl.h" + +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/cl/cl_half.h" +#include "framework/cl/cl_image_converter.h" +#include "framework/cl/cl_tool.h" +#include "framework/ddim.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace framework { + +class CLImage { + public: + CLImage() = default; + + ~CLImage() { + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + } + + if (image_converter_) { + delete (image_converter_); + } + } + /* + * will not hold input tensor data, memcpy in this method + * */ + void SetTensorData(float *tensorData, const DDim &dim) { + int numel = product(dim); + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + tensor_data_ = nullptr; + } + tensor_data_ = new float[numel]; + memcpy(tensor_data_, tensorData, numel * sizeof(float)); + tensor_dims_ = dim; + } + + /* + * need call SetTensorData first + * + * folder when one dim or two dim + * */ + void InitCLImage(cl_context context, cl_command_queue command_queue) { + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + InitCLImage(context, command_queue, folder_converter); + } + + void InitCLImage(cl_context context, cl_command_queue command_queue, + CLImageConverterBase *converter) { + if (image_converter_ != nullptr) { + delete (image_converter_); + } + + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + + DLOG << " begin init cl image "; + image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); + + half_t *image_data = new half_t[product(image_dims_) * 4]; + + DLOG << " convert to image"; + converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); + DLOG << " end convert to image"; + + InitCLImage(context, image_dims_[0], image_dims_[1], image_data); + + delete[](image_data); + delete[](tensor_data_); + + 
command_queue_ = command_queue; + tensor_data_ = nullptr; + image_converter_ = converter; + initialized_ = true; + DLOG << " end init cl image"; + } + + void InitNImage(cl_context context, cl_command_queue command_queue) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock(); + InitCLImage(context, command_queue, folder_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); + } + void InitDWImage(cl_context context, cl_command_queue command_queue) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock(); + InitCLImage(context, command_queue, dw_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); + } + + void InitEmptyImage(cl_context context, cl_command_queue command_queue, + const DDim &dim) { + PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, + " empty image tensor data shouldn't have value"); + + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + + DLOG << " to get image dims "; + image_dims_ = folder_converter->InitImageDimInfoWith(dim); + DLOG << " end get image dims " << image_dims_; + + InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); + + tensor_dims_ = dim; + command_queue_ = command_queue; + image_converter_ = folder_converter; + cl_event_ = CLEngine::Instance()->CreateEvent(context); + initialized_ = true; + DLOG << " end init cl image"; + } + + cl_mem GetCLImage() const { return cl_image_.get(); } + + const DDim &ImageDims() const { return image_dims_; } + + inline size_t ImageWidth() const { return image_dims_[0]; } + + inline size_t ImageHeight() const { return image_dims_[1]; } + + inline cl_command_queue CommandQueue() const { return command_queue_; } + + /* + * resize original tensor dim + * */ + inline 
CLImage &Resize(const DDim &dims) { + tensor_dims_ = dims; + return *this; + } + + template + T *data() const { + if (initialized_) { + PADDLE_MOBILE_THROW_EXCEPTION( + " cl image has initialized, tensor data has been deleted, can't use " + "tensor data"); + } + return reinterpret_cast(tensor_data_); + } + + /* + * numel of tensor dim + * */ + inline int64_t numel() const { return product(tensor_dims_); } + + /* + * original tensor dim + * */ + const DDim &dims() const { return tensor_dims_; } + + cl_event GetClEvent() const { return cl_event_.get(); } + + CLImageConverterBase *Converter() const { return image_converter_; } + + private: + void InitCLImage(cl_context context, int width, int height, void *data) { + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; + cl_image_desc cid = { + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = width, + .image_height = height, + .image_depth = 1, + .image_array_size = 1, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + // .buffer = nullptr + }; + cid.buffer = nullptr; + cl_int err; + cl_mem cl_image = clCreateImage( + context, CL_MEM_READ_WRITE | (data ? 
CL_MEM_COPY_HOST_PTR : 0), + &cf, // const cl_image_format *image_format + &cid, // const cl_image_desc *image_desc + data, // void *host_ptr + &err); + cl_image_.reset(cl_image); + if (err != CL_SUCCESS) { + CL_CHECK_ERRORS(err); + PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); + } + } + + bool initialized_ = false; + std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_; + std::unique_ptr<_cl_event, CLEventDeleter> cl_event_; + DDim tensor_dims_; + DDim image_dims_; + float *tensor_data_ = nullptr; + cl_context context_; + cl_command_queue command_queue_; + CLImageConverterBase *image_converter_ = nullptr; +}; + +void TensorToCLImage(Tensor *tensor, CLImage *image, + cl_command_queue commandQueue); + +void CLImageToTensor(CLImage *image, Tensor *tensor, + cl_command_queue commandQueue); + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &image); +#endif + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13094a8d05ac6f7f8d2451a3498da058b37ee98b --- /dev/null +++ b/src/framework/cl/cl_image_converter.cpp @@ -0,0 +1,393 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/cl/cl_image_converter.h" + +namespace paddle_mobile { +namespace framework { + +const DDim &CLImageConverterDefault::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + return make_ddim({width, height}); +} + +void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = nchw; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + int width = image_dim[0]; 
+ int height = image_dim[0]; + + float *p = tensor; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +const DDim &CLImageConverterFolder::InitImageDimInfoWith( + const DDim &tensor_dim) { + if (tensor_dim.size() <= 2) { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + int width = (tdim[1] + 3) / 4; + int height = tdim[0]; + + width_of_one_block_ = width; + height_of_one_block_ = height; + c_block_ = 1; + + return make_ddim({width, height}); + + } else { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); + } +} + +void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); + + } else { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + + DDim image_dim = InitImageDimInfoWith(tensor_dim); + int width = image_dim[0]; + + for (int h = 0; h < tdim[0]; h++) { + for (int w = 0; w < tdim[1]; w++) { + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * 
tdim[1] + w]); + } + } + } +} + +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); + + } else { + int width = image_dim[0]; + int height = image_dim[1]; + int H, W; + + if (tensor_dim.size() == 2) { + H = tensor_dim[0]; + W = tensor_dim[1]; + } else if (tensor_dim.size() == 1) { + H = 1; + W = tensor_dim[0]; + } + float *p = tensor; + + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); + } + } + } +} + +const DDim &CLImageConverterNWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + auto image_dim = InitImageDimInfoWith(tensor_dim); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < block * 4; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + if (n < N) { + image[index] = Float2Half(*p); + p++; + } else { + image[index] = 0.0; + } + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +void 
CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + *p = Half2Float(image[index]); + p++; + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +const DDim &CLImageConverterDWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[1]; + C = new_dims[0]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = tensor; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < 
C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[1]; + int C = tensor_dim[0]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[0]; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..02887b0cd468a45630122bb3f236c0775ac1eaa1 --- /dev/null +++ b/src/framework/cl/cl_image_converter.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "framework/cl/cl_half.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +class CLImageConverterBase { + public: + virtual void NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) = 0; + + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, + const DDim &tensor_dim) = 0; + virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0; +}; + +class CLImageConverterDefault : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +class CLImageConverterFolder : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; +class CLImageConverterDWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim 
&image_dim, + const DDim &tensor_dim); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h new file mode 100644 index 0000000000000000000000000000000000000000..c7c06ca75f47cd65d2350dfa6930068aca73ced0 --- /dev/null +++ b/src/framework/cl/cl_scope.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "CL/cl.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +class CLScope { + public: + CLScope() { + CLEngine *engin = CLEngine::Instance(); + context_ = engin->CreateContext(); + command_queue_ = engin->CreateClCommandQueue(context_.get()); + } + + cl_command_queue CommandQueue() { return command_queue_.get(); } + + std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( + const std::string &kernel_name, const std::string &file_name) { + DLOG << " to get program " << file_name; + auto program = Program(file_name); + DLOG << " end get program ~ "; + DLOG << " to create kernel: " << kernel_name; + std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( + clCreateKernel(program, kernel_name.c_str(), &status_)); + CL_CHECK_ERRORS(status_); + DLOG << " end create kernel ~ "; + return std::move(kernel); + } + + cl_context Context() { return context_.get(); } + + 
cl_program Program(const std::string &file_name) { + auto it = programs_.find(file_name); + if (it != programs_.end()) { + return it->second.get(); + } + + auto program = CLEngine::Instance()->CreateProgramWith( + context_.get(), + CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name); + + DLOG << " --- begin build program -> " << file_name << " --- "; + CLEngine::Instance()->BuildProgram(program.get()); + DLOG << " --- end build program -> " << file_name << " --- "; + + programs_[file_name] = std::move(program); + + return programs_[file_name].get(); + } + + private: + cl_int status_; + std::unique_ptr<_cl_context, CLContextDeleter> context_; + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; + std::unordered_map> + programs_; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_tensor.h b/src/framework/cl/cl_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..b853fa0e8d734c38de2fdc53f766d735dc72bb20 --- /dev/null +++ b/src/framework/cl/cl_tensor.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "CL/cl.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/tensor_base.h" + +namespace paddle_mobile { +namespace framework { + +class CLTensor : TensorBase { + public: + CLTensor(cl_context context, cl_command_queue command_queue) + : context_(context), command_queue_(command_queue) {} + + CLTensor() = default; + + /* + * if init method haven't set context and command_queue, need set + * */ + void SetContextAndCommandQueue(cl_context context, + cl_command_queue command_queue) { + context_ = context; + command_queue_ = command_queue; + } + + /*! Resize the dimensions of the memory block. */ + inline CLTensor &Resize(const DDim &dims) { + dims_ = dims; + return *this; + } + + template + inline cl_mem mutable_with_data(const T *data) { + int64_t size = numel() * sizeof(T); + + holder_.reset(new PlaceholderImpl( + size, reinterpret_cast(const_cast(data)), typeid(T), + context_, command_queue_)); + return reinterpret_cast(holder_->ptr()); + } + + inline cl_mem mutable_data(std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") + int64_t size = numel() * SizeOfType(type); + if (holder_ == nullptr || holder_->size() < size + offset_) { + holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_)); + offset_ = 0; + } + return reinterpret_cast(holder_->ptr()); + } + + /** + * @brief Return a pointer to cl buffer. + * @note If not exist, then allocation. + */ + template + inline cl_mem mutable_data() { + return reinterpret_cast(mutable_data(typeid(T))); + } + + /** + * @brief Return a pointer to cl buffer. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. 
+ */ + template + inline cl_mem mutable_data(DDim dims) { + Resize(dims); + return mutable_data(); + } + + inline cl_mem CLBuffer() { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr())); + } + + template + inline T *Data() { + if (host_ptr_) { + delete (host_ptr_); + host_ptr_ = nullptr; + } + cl_mem buffer = CLBuffer(); + host_ptr_ = new char[holder_->size()]; + cl_int status; + status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0, + holder_->size(), host_ptr_, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + return reinterpret_cast(host_ptr_); + } + + int memorySize() { return holder_->size(); } + + ~CLTensor() { + DLOG << "~CLTensor"; + if (host_ptr_) { + DLOG << " delete host ptr "; + delete (host_ptr_); + host_ptr_ = nullptr; + } + } + + private: + cl_context context_; + cl_command_queue command_queue_; + void *host_ptr_ = nullptr; + + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(size_t size, void *input, std::type_index type, + cl_context context, cl_command_queue command_queue) + : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + size, reinterpret_cast(input), NULL)), + size_(size), + type_(type), + command_queue_(command_queue) {} + + PlaceholderImpl(size_t size, std::type_index type, cl_context context, + cl_command_queue command_queue) + : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)), + size_(size), + type_(type), + command_queue_(command_queue) {} + + virtual size_t size() const { return size_; } + + virtual void *ptr() const { return static_cast(ptr_.get()); } + + virtual std::type_index type() const { return type_; } + + virtual void set_type(std::type_index type) { type_ = type; } + + std::unique_ptr<_cl_mem, CLMemDeleter> ptr_; + + size_t size_; + + /* the current type of memory */ + std::type_index type_; + + cl_command_queue command_queue_; + }; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git 
a/src/framework/cl/cl_tool.cpp b/src/framework/cl/cl_tool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..827642b6b73cfaee02f4053dce798bf6b3c52f4b --- /dev/null +++ b/src/framework/cl/cl_tool.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +const char *opencl_error_to_str(cl_int error) { +#define CASE_CL_CONSTANT(NAME) \ + case NAME: \ + return #NAME; + // Suppose that no combinations are possible. 
+ switch (error) { + CASE_CL_CONSTANT(CL_SUCCESS) + CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) + CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) + CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) + CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) + CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) + CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) + CASE_CL_CONSTANT(CL_MAP_FAILURE) + CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) + CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_VALUE) + CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) + CASE_CL_CONSTANT(CL_INVALID_PLATFORM) + CASE_CL_CONSTANT(CL_INVALID_DEVICE) + CASE_CL_CONSTANT(CL_INVALID_CONTEXT) + CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) + CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) + CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) + CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) + CASE_CL_CONSTANT(CL_INVALID_SAMPLER) + CASE_CL_CONSTANT(CL_INVALID_BINARY) + CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) + CASE_CL_CONSTANT(CL_INVALID_KERNEL) + CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) + CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) + CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) + CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) + CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) + CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) + CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_EVENT) + CASE_CL_CONSTANT(CL_INVALID_OPERATION) + 
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) + CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) + CASE_CL_CONSTANT(CL_INVALID_PROPERTY) + + default: + return "UNKNOWN ERROR CODE"; + } +#undef CASE_CL_CONSTANT +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_tool.h b/src/framework/cl/cl_tool.h new file mode 100644 index 0000000000000000000000000000000000000000..25d5bfc584b59e4fe9d22a922b601f8c32892fd1 --- /dev/null +++ b/src/framework/cl/cl_tool.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CL/cl.h" + +namespace paddle_mobile { +namespace framework { + +const char* opencl_error_to_str(cl_int error); + +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "OpenCL error with code %s happened in file %s at line %d. 
" \ + "Exiting.\n", \ + paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ + __LINE__); \ + } + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 0ba31ef9b7016b453b34cc4a023b0841b2110540..665b5315bc1c0fca7b9e62f89062f375a9a011be 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) { return DataLayout::kAnyLayout; } else { PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) - exit(0); } } @@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { return "ANY_LAYOUT"; default: PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") - exit(0); break; } } diff --git a/src/framework/dim.h b/src/framework/dim.h index 85e86076e1de53fa80b75f56237901da49e22eb9..7c78659e3baacdf707dc46884c099dfd0cd284bb 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -42,7 +42,7 @@ struct Dim { : head(idx % size.head), tail(idx / size.head, size.tail) {} /** Construct a Dim with each dimension set to the given index */ - Dim(int64_t idx) : head(idx), tail(idx) {} + explicit Dim(int64_t idx) : head(idx), tail(idx) {} bool operator==(const Dim &o) const { return (head == o.head) && (tail == o.tail); @@ -65,7 +65,7 @@ template <> struct Dim<0> { static constexpr int dimensions = 0; - Dim(int64_t _head) {} + explicit Dim(int64_t _head) {} Dim() {} @@ -131,7 +131,6 @@ int64_t &indexer(Dim &dim, int idx) { template <> int64_t &indexer<0>(Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } template @@ -148,7 +147,6 @@ int64_t indexer(const Dim &dim, int idx) { template <> int64_t indexer<0>(const Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } } // namespace diff --git a/src/io/executor.cpp b/src/framework/executor.cpp similarity index 64% rename from 
src/io/executor.cpp rename to src/framework/executor.cpp index 9efec27c9df3d51a3411db87faee924b374d2ac7..c7ef09ed5a1466a7396ec9c177eb3c48abd91ad7 100644 --- a/src/io/executor.cpp +++ b/src/framework/executor.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/executor.h" +#include "framework/executor.h" #include #include #include @@ -26,12 +26,26 @@ limitations under the License. */ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" -#include "operators/math/gemm.h" +#include "memory/t_malloc.h" + +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include "common/threadpool.h" +#endif + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif namespace paddle_mobile { +namespace framework { +using framework::Variable; using framework::Variable; +#pragma mark - executor + template Executor::Executor(const framework::Program p, int batch_size, const bool use_optimize, const bool loddable) @@ -73,8 +87,10 @@ Executor::Executor(const framework::Program p, int batch_size, } std::shared_ptr to_predict_block = to_predict_program_->Block(0); + int i = 0; auto &ops = ops_of_block_[*to_predict_block.get()]; for (const auto &op : ops) { + DLOG << "Initialize op[" << i++ << "]: " << op->Type(); op->Init(); } } @@ -89,8 +105,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor, // should be moved into operator init function float min_value; float max_value; - memcpy(&min_value, data_buf, sizeof(float)); - memcpy(&max_value, data_buf + sizeof(float), sizeof(float)); + memory::Copy(&min_value, data_buf, sizeof(float)); + memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float)); data_buf += 2 * sizeof(float); const float factor = (max_value - min_value) / 255.0; const uint8_t *uint8_data = reinterpret_cast(data_buf); @@ -99,7 
+115,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor, } data_buf += size * sizeof(uint8_t); } else { - memcpy(tensor_data, *data_buf, size * sizeof(Dtype)); + memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype)); *data_buf += size * sizeof(Dtype); } } @@ -115,7 +131,7 @@ void Executor::LoadMemory( // lod information // uint64_t lod_level = *(reinterpret_cast(*data_buf)); uint64_t lod_level = 0; - memcpy(&lod_level, *data_buf, sizeof(uint64_t)); + memory::Copy(&lod_level, *data_buf, sizeof(uint64_t)); *data_buf += sizeof(uint64_t); auto *lod = tensor->mutable_lod(); @@ -124,7 +140,7 @@ void Executor::LoadMemory( uint64_t size = *(reinterpret_cast(*data_buf)); *data_buf += sizeof(uint64_t); std::vector tmp_dim(size / sizeof(size_t)); - memcpy(tmp_dim.data(), *data_buf, size); + memory::Copy(tmp_dim.data(), *data_buf, size); (*lod)[i] = std::move(tmp_dim); *data_buf += size; } @@ -390,13 +406,18 @@ std::vector::Ptype> Executor::Predict( const std::vector &input, const std::vector &dims) { framework::Tensor tensor(input, framework::make_ddim(dims)); std::shared_ptr output_tensor = Predict(tensor, 0); - Executor::Ptype *output_ptr = - output_tensor->data::Ptype>(); - std::vector::Ptype> result_vector; - for (int j = 0; j < output_tensor->numel(); ++j) { - result_vector.push_back(output_ptr[j]); + if (output_tensor != nullptr) { + Executor::Ptype *output_ptr = + output_tensor->data::Ptype>(); + std::vector::Ptype> result_vector; + for (int j = 0; j < output_tensor->numel(); ++j) { + result_vector.push_back(output_ptr[j]); + } + return result_vector; + } else { + DLOG << "return empty vector"; + return {}; } - return result_vector; } #ifdef PADDLE_MOBILE_FPGA @@ -470,8 +491,236 @@ void Executor::Predict_To(int end) { } #endif +#ifdef PADDLE_MOBILE_CL +template +void Executor::LoadMemory(const framework::VarDesc var_desc, + float *tensorInput, char **data) {} + +template <> +void Executor::LoadMemory( + const framework::VarDesc var_desc, 
float *tensorInput, char **data) { + // 1. version + uint32_t version = *reinterpret_cast(*data); + + (*data) += sizeof(uint32_t); + + // 2 Lod information + uint64_t *lod_level_ptr = new uint64_t(); + memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); + uint64_t lod_level = *lod_level_ptr; + delete lod_level_ptr; + (*data) += sizeof(uint64_t); + + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size = *reinterpret_cast(*data); + (*data) += sizeof(uint64_t); + std::vector tmp(size / sizeof(size_t)); + + for (int k = 0; k < tmp.size(); ++k) { + tmp[k] = *reinterpret_cast(*data); + (*data) += sizeof(size_t); + } + } + + // 3. tensor version + uint32_t tensor_version = *reinterpret_cast(*data); + (*data) += sizeof(uint32_t); + + // 4. tensor desc + int32_t size = *reinterpret_cast(*data); + (*data) += sizeof(int32_t); + + std::unique_ptr buf(new char[size]); + for (int m = 0; m < size; ++m) { + buf.get()[m] = (*data)[m]; + } + (*data) += (sizeof(char) * size); + + const framework::TensorDesc &desc = var_desc.Tensor_desc(); + int memory_size = 1; + for (auto l : desc.Dims()) { + memory_size *= l; + } + + void *memory = nullptr; + // int type_size = 0; + // switch (desc.DataType()) { + // case framework::VARTYPE_TYPE_FP16: + // type_size = 2; + // break; + // case framework::VARTYPE_TYPE_FP32: + // type_size = 4; + // memory = tensor->mutable_data(); + // break; + // case framework::VARTYPE_TYPE_FP64: + // type_size = 8; + // break; + // case framework::VARTYPE_TYPE_INT32: + // memory = tensor->mutable_data(); + // type_size = 4; + // break; + // case framework::VARTYPE_TYPE_INT64: + // type_size = 8; + // break; + // case framework::VARTYPE_TYPE_BOOL: + // type_size = 1; + // break; + // default: + // break; + // } + int type_size = 4; + memory = tensorInput; + if (program_.quantification) { + float min_value; + float max_value; + + memcpy(&min_value, *data, sizeof(float)); + memcpy(&max_value, *data + sizeof(float), sizeof(float)); + *data += 2 * sizeof(float); + 
const float factor = (max_value - min_value) / 255.0; + uint8_t *uint8_data = reinterpret_cast(*data); + for (int k = 0; k < memory_size; ++k) { + static_cast(memory)[k] = uint8_data[k] * factor + min_value; + } + *data += (memory_size * sizeof(uint8_t)); + } else { + for (int n = 0; n < memory_size; n++) { + float value; + memcpy(&value, *data + n * type_size, type_size); + if (value < 1e-30 && value > -1e-30) { + static_cast(memory)[n] = 0.0; + } else { + static_cast(memory)[n] = value; + } + } + (*data) += (sizeof(char) * memory_size * type_size); + } +} + +template <> +void Executor::InitMemory() { + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + CLImage *cl_image = nullptr; + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + var->template GetMutable(); + continue; + } else { + cl_image = var->template GetMutable(); + } + + char *origin_data = + ReadFileToBuff(program_.model_path + "/" + var_desc->Name()); + char *data = origin_data; + cl_context context = program_.scope->GetCLScpoe()->Context(); + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + int numel = 1; + for (auto l : desc.Dims()) { + numel *= l; + } + DLOG << var_desc->Name(); + float *tensorInput = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + LoadMemory(*var_desc, tensorInput, &data); + + framework::DDim ddim = framework::make_ddim(desc.Dims()); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + + delete origin_data; + paddle_mobile::memory::Free(tensorInput); + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + auto cl_image = var->template GetMutable(); + cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); + + const framework::TensorDesc &desc = 
var_desc->Tensor_desc(); + // framework::DDim ddim = framework::make_ddim(desc.Dims()); + framework::DDim ddim = cl_image->dims(); + DLOG << var_desc->Name(); + cl_image->InitEmptyImage(context, command_queue, ddim); + } + } + } + } +} + +template <> +void Executor::InitCombineMemory() { + char *origin_data = nullptr; + bool self_alloc = false; + if (program_.combined_params_buf && program_.combined_params_len) { + LOG(kLOG_INFO) << "use outter memory"; + origin_data = reinterpret_cast(program_.combined_params_buf); + } else { + LOG(kLOG_INFO) << " begin init combine memory"; + self_alloc = true; + origin_data = ReadFileToBuff(program_.para_path); + } + PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); + float *data = reinterpret_cast(origin_data); + + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + CLImage *cl_image = nullptr; + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + var->template GetMutable(); + continue; + } else { + cl_image = var->template GetMutable(); + } + + cl_context context = program_.scope->GetCLScpoe()->Context(); + + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + framework::DDim ddim = framework::make_ddim(desc.Dims()); + + int numel = 1; + for (int i = 0; i < ddim.size(); i++) { + numel = numel * ddim[i]; + } + float *tensorInput = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + LoadMemory(*var_desc, tensorInput, &origin_data); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + + paddle_mobile::memory::Free(tensorInput); + } else { + auto cl_image = var->template GetMutable(); + cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + framework::DDim 
ddim = cl_image->dims(); + // framework::DDim ddim = framework::make_ddim(desc.Dims()); + cl_image->InitEmptyImage(context, command_queue, ddim); + } + } + } + if (self_alloc) { + delete data; + } + LOG(kLOG_INFO) << " end init combine memory "; +} + +#endif + template class Executor; -template class Executor; + template class Executor; +template class Executor; + +template class Executor; + +} // namespace framework } // namespace paddle_mobile diff --git a/src/io/executor.h b/src/framework/executor.h similarity index 95% rename from src/io/executor.h rename to src/framework/executor.h index 98906749effb7e46318157085c4505c57726ec62..be1c87e239c9c2ace9b4791f9769c176c9d5ef8e 100644 --- a/src/io/executor.h +++ b/src/framework/executor.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "framework/tensor.h" namespace paddle_mobile { +namespace framework { template class Executor { @@ -79,7 +80,10 @@ class Executor { void LoadMemory(void **data, const std::shared_ptr var_desc, framework::LoDTensor *tensor); - +#ifdef PADDLE_MOBILE_CL + void LoadMemory(const framework::VarDesc var_desc, float *tensorInput, + char **data); +#endif framework::Program program_; int batch_size_ = 1; std::shared_ptr to_predict_program_; @@ -97,4 +101,5 @@ class Executor { bool loddable_ = false; }; +} // namespace framework } // namespace paddle_mobile diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 4c6842572e49daa283efa2d92bd43e4687d92e26..982f1c0f3525afde8475866c0121343fafc9d5a0 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -109,9 +109,15 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu); #ifdef RESHAPE_OP LOAD_OP2(reshape, CPU, MALI_GPU); #endif +#ifdef RESHAPE2_OP +LOAD_OP2(reshape2, CPU, MALI_GPU); +#endif #ifdef TRANSPOSE_OP LOAD_OP1(transpose, CPU); #endif +#ifdef TRANSPOSE2_OP +LOAD_OP1(transpose2, CPU); +#endif #ifdef PRIORBOX_OP LOAD_OP1(prior_box, CPU); #endif diff --git a/src/io/loader.cpp b/src/framework/loader.cpp 
similarity index 51% rename from src/io/loader.cpp rename to src/framework/loader.cpp index 7dd55950be240a88a7521d4be260416625419015..5587d0698fa2b9a04532deae618545d15ecd631f 100644 --- a/src/io/loader.cpp +++ b/src/framework/loader.cpp @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/loader.h" +#include "framework/loader.h" #include "framework/lod_tensor.h" #include "framework/program/program-optimize/program_optimize.h" +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif namespace paddle_mobile { -using framework::Variable; +namespace framework { /** * muteandresize tensor as originProgramDesc and scope in loadParams @@ -26,23 +29,57 @@ using framework::Variable; * @param originProgramDesc * @param scope */ -void InitMemoryFromProgram( - std::shared_ptr &originProgramDesc, // NOLINT - std::shared_ptr &scope) { // NOLINT +template +void Loader::InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope) { + for (const auto &block : originProgramDesc.get()->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = scope.get()->Var(var_desc->Name()); + if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable()) { + auto dim = var_desc->Tensor_desc().Dims(); + auto tensor = var->GetMutable(); + tensor->Resize(make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + // dim[0] = 1; + for (auto &d : dim) { + if (d < 0) { + d *= -1; + } + } + auto tensor = var->GetMutable(); + tensor->Resize(make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. 
+ } + } + } +} + +#ifdef PADDLE_MOBILE_CL +template <> +void Loader::InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope) { for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &var_desc : block->Vars()) { auto var = scope.get()->Var(var_desc->Name()); - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Persistable()) { auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); + // auto tensor = var->GetMutable(); + auto cl_image = var->GetMutable(); + cl_image->Resize(make_ddim(dim)); } else { auto dim = var_desc->Tensor_desc().Dims(); PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); + auto cl_image = var->GetMutable(); + cl_image->Resize(make_ddim(dim)); } } else { // TODO(codeWorm): some. 
@@ -50,6 +87,56 @@ void InitMemoryFromProgram( } } } +template <> +const Program +Loader::LoadCombinedMemory( + size_t read_size, const uint8_t *buf, size_t combined_params_len, + uint8_t *combined_params_buf, bool optimize, bool quantification) { + bool can_add_split = false; + + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + nullptr, read_size, buf); + // + PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null"); + // + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; + // + + auto originProgramDesc = std::make_shared(c_program); + + Program program; + program.combined = true; + program.originProgram = originProgramDesc; + program.quantification = quantification; + program.combined_params_len = combined_params_len; + program.combined_params_buf = combined_params_buf; + + auto scope = std::make_shared(); + program.scope = scope; + InitMemoryFromProgram(originProgramDesc, scope); + if (optimize) { + ProgramOptimize program_optimize; + program.optimizeProgram = + program_optimize.FusionOptimize(originProgramDesc, can_add_split); + if (!program.optimizeProgram) { + program.optimizeProgram = originProgramDesc; + } + } + if (optimize) { + program.optimizeProgram->Description("optimize: "); + } else { + originProgramDesc->Description("program: "); + } + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, + nullptr); + return program; +} + +#endif + /** * fusion and print someinfos * @tparam Dtype @@ -61,19 +148,18 @@ void InitMemoryFromProgram( */ template void FusionAndPrintInfos( - bool optimize, bool can_add_split, - framework::Program &program, // NOLINT - const std::shared_ptr &originProgramDesc) { + bool optimize, bool can_add_split, Program *program, + const std::shared_ptr &originProgramDesc) { if (optimize) { - framework::ProgramOptimize program_optimize; - program.optimizeProgram = 
+ ProgramOptimize program_optimize; + program->optimizeProgram = program_optimize.FusionOptimize(originProgramDesc, can_add_split); - if (!program.optimizeProgram) { - program.optimizeProgram = originProgramDesc; + if (!program->optimizeProgram) { + program->optimizeProgram = originProgramDesc; } } if (optimize) { - program.optimizeProgram->Description("optimize: "); + program->optimizeProgram->Description("optimize: "); } else { originProgramDesc->Description("program: "); } @@ -102,9 +188,10 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { } template -const framework::Program Loader::Load( - const std::string &dirname, bool optimize, bool quantification, - bool can_add_split) { +const Program Loader::Load(const std::string &dirname, + bool optimize, + bool quantification, + bool can_add_split) { auto program = this->LoadProgram(dirname + "/__model__", optimize, quantification, can_add_split); program.model_path = dirname; @@ -112,9 +199,10 @@ const framework::Program Loader::Load( } template -const framework::Program Loader::Load( - const std::string &model_path, const std::string ¶_path, bool optimize, - bool quantification) { +const Program Loader::Load(const std::string &model_path, + const std::string ¶_path, + bool optimize, + bool quantification) { auto program = this->LoadProgram(model_path, optimize, quantification); program.para_path = para_path; @@ -124,7 +212,7 @@ const framework::Program Loader::Load( } template -const framework::Program Loader::LoadProgram( +const Program Loader::LoadProgram( const std::string &model_path, bool optimize, bool quantification, bool can_add_split) { std::string model_filename = model_path; @@ -141,29 +229,29 @@ const framework::Program Loader::LoadProgram( // DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - auto originProgramDesc = std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); - framework::Program program; + Program program; program.originProgram = 
originProgramDesc; program.quantification = quantification; program.combined_params_len = 0; program.combined_params_buf = nullptr; - auto scope = std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; // use originProgramDesc and scope to init tensors InitMemoryFromProgram(originProgramDesc, scope); // perform fusion and print infos - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); return program; } template -const framework::Program Loader::LoadCombinedMemory( +const Program Loader::LoadCombinedMemory( size_t read_size, const uint8_t *buf, size_t combined_params_len, - const uint8_t *combined_params_buf, bool optimize, bool quantification) { + uint8_t *combined_params_buf, bool optimize, bool quantification) { bool can_add_split = false; PaddleMobile__Framework__Proto__ProgramDesc *c_program; @@ -177,26 +265,31 @@ const framework::Program Loader::LoadCombinedMemory( DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - auto originProgramDesc = std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); - framework::Program program; + Program program; program.combined = true; program.originProgram = originProgramDesc; program.quantification = quantification; program.combined_params_len = combined_params_len; program.combined_params_buf = combined_params_buf; - auto scope = std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; InitMemoryFromProgram(originProgramDesc, scope); - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, nullptr); return program; } template class Loader; + template class Loader; + template class Loader; +template class 
Loader; + +} // namespace framework } // namespace paddle_mobile diff --git a/src/framework/loader.h b/src/framework/loader.h new file mode 100644 index 0000000000000000000000000000000000000000..3200f0b25368fa123b80c51000cfd6c6a6d084b6 --- /dev/null +++ b/src/framework/loader.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "common/types.h" +#include "framework/program/program.h" + +namespace paddle_mobile { +namespace framework { + +template +class Loader { + public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + const Program Load(const std::string &dirname, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + const Program Load(const std::string &model_path, + const std::string ¶_path, + bool optimize = false, + bool quantification = false); + + const Program LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, + size_t combined_params_len, + uint8_t *combined_params_buf, + bool optimize = false, + bool quantification = false); + + private: + const Program LoadProgram(const std::string &model_path, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + void InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope); +}; + +} // namespace 
framework +} // namespace paddle_mobile diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h index 32954531d0854b3318185aacdf99314051f98f6a..219385ab1429fefddc9d380799259f7562e0030f 100644 --- a/src/framework/op_registry.h +++ b/src/framework/op_registry.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once +#include #include #include + #include "common/log.h" #include "common/type_define.h" #include "framework/op_info.h" @@ -120,5 +122,8 @@ class OpRegistry { #define REGISTER_OPERATOR_FPGA(op_type, op_class) \ REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); +#define REGISTER_OPERATOR_CL(op_type, op_class) \ + REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL); + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 21b14dfcac682e7d310dcf4e8c47afaa0fb68fb3..e0b40cebf7f14e0b927e4666d63e740213918333 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -56,7 +56,7 @@ template void OperatorBase::CheckAllInputOutputSet() const {} template -void OperatorBase::Run() const { +void OperatorBase::Run() { RunImpl(); #ifdef PADDLE_MOBILE_DEBUG DLOG << "-------------" << type_ << "----------------------------"; @@ -84,9 +84,57 @@ void OperatorBase::Run() const { #endif } +#ifdef PADDLE_MOBILE_CL +template <> +void OperatorBase::Run() { + RunImpl(); +#ifdef PADDLE_MOBILE_DEBUG + DLOG << "-------------" << type_ << "----------------------------"; + vector input_keys = GetInputKeys(); + for (const auto key : input_keys) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto vari = scope_->FindVar(var_vec_in[i]); + if (vari->IsInitialized()) { + if (type_ == "feed") { + Tensor *tensor = vari->template GetMutable(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + } else { + CLImage *cl_image = vari->template GetMutable(); + if (cl_image) { + DLOG << type_ << " 
input- " << key << "=" << *cl_image; + } + } + } + } + } + for (const auto key : GetOutKeys()) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto vari = scope_->FindVar(var_vec_out[i]); + if (vari->IsInitialized()) { + if (type_ == "fetch") { + Tensor *tensor = vari->template GetMutable(); + if (tensor) { + DLOG << type_ << " output- " << key << "=" << *tensor; + } + } else { + CLImage *cl_image = vari->template GetMutable(); + if (cl_image) { + DLOG << type_ << " output- " << key << "=" << *cl_image; + } + } + } + } + } +#endif +} +#endif + template class OperatorBase; template class OperatorBase; template class OperatorBase; +template class OperatorBase; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.h b/src/framework/operator.h index 0a9127f079f3c30acbc9f9c7cf0518d7354b5431..464910b613322451d05adcc772825079d0d8f677 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -32,7 +32,10 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" - +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_helper.h" +#include "framework/cl/cl_scope.h" +#endif namespace paddle_mobile { namespace framework { using std::string; @@ -60,10 +63,10 @@ class OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope); virtual ~OperatorBase() {} - void Run() const; + void Run(); std::vector GetOutKeys() const; std::vector GetInputKeys() const; - virtual void RunImpl() const = 0; + virtual void RunImpl() = 0; virtual void Init() = 0; /* @@ -113,9 +116,13 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) : OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, *scope) { +#ifdef PADDLE_MOBILE_CL + kernel_.InitCLHelper(scope->GetCLScpoe()); +#endif + } - virtual void RunImpl() const { this->kernel_.Compute(this->param_); } + virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; @@ -135,22 +142,35 @@ class OperatorWithKernel : public OperatorBase { template class OpKernelBase { public: - /* - * @b 所有kernel 需实现 Compute 方法 - * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, - * 所有结构体存在与: paddle-mobile/src/operators/op_param.h - * */ -#ifdef PADDLE_MOBILE_MALI_GPU + OpKernelBase() = default; + +#ifdef PADDLE_MOBILE_CL + virtual void InitCLHelper(CLScope *clScope) { + cl_helper_ = CLHelper(clScope); + } +#endif + + /* + * @b 所有kernel 需实现 Compute 方法 + * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, + * 所有结构体存在与: paddle-mobile/src/operators/op_param.h + * */ +#ifdef PADDLE_McOBILE_MALI_GPU OpKernelBase() { acl_op_ = nullptr; } void *GetAclOp() const { return acl_op_; } void SetAclOp(void *op, void *ob) const { reinterpret_cast *>(ob)->acl_op_ = op; } #endif - virtual void Compute(const P ¶) const = 0; + virtual void 
Compute(const P ¶) = 0; virtual bool Init(P *para) { return true; } virtual ~OpKernelBase() = default; + protected: +#ifdef PADDLE_MOBILE_CL + CLHelper cl_helper_; +#endif + private: #ifdef PADDLE_MOBILE_MALI_GPU void *acl_op_; diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 696cf75b91ff88837cffd3304f5fe3cd491e77eb..6a25b1c40bd5c1b74ded54ee4134d71c77b15244 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "framework/program/program_desc.h" #include "framework/scope.h" +#include + namespace paddle_mobile { namespace framework { @@ -32,7 +34,7 @@ class Program { bool combined = false; bool quantification = false; size_t combined_params_len; - const uint8_t *combined_params_buf; + uint8_t *combined_params_buf; }; } // namespace framework diff --git a/src/framework/scope.h b/src/framework/scope.h index 054f141ff68895e0879fd31e15d90c76ea038135..abc727231a0d119ab53d765ab020085aaab9102d 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -15,8 +15,14 @@ limitations under the License. */ #pragma once #include +#include #include -#include "variable.h" +#include + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_scope.h" +#endif +#include "framework/variable.h" namespace paddle_mobile { namespace framework { @@ -33,6 +39,10 @@ class Scope { delete kid; } kids_.clear(); + +#ifdef PADDLE_MOBILE_CL + delete cl_scope_; +#endif } Scope &NewScope() const; @@ -72,6 +82,10 @@ class Scope { Variable *FindVarLocally(const std::string &name) const; +#ifdef PADDLE_MOBILE_CL + CLScope *GetCLScpoe() { return cl_scope_; } +#endif + private: // Call Scope::NewScope for a sub-scope. 
explicit Scope(Scope const *parent) : parent_(parent) {} @@ -79,6 +93,10 @@ class Scope { mutable std::unordered_map vars_; mutable std::list kids_; Scope const *parent_{nullptr}; + +#ifdef PADDLE_MOBILE_CL + CLScope *cl_scope_ = new CLScope(); +#endif }; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 8609d8d1530495526302ee50dd5b83ea3d220b1a..9e6ae7288b755d40973264f8744c7c54f73193bd 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -24,65 +24,24 @@ limitations under the License. */ #include #include "common/enforce.h" -#include "common/types.h" #include "framework/data_layout.h" -#include "framework/ddim.h" +#include "framework/tensor_base.h" #include "memory/t_malloc.h" namespace paddle_mobile { namespace framework { -template -struct SizeOfTypeFunctor; - -template -struct SizeOfTypeFunctor { - size_t operator()(std::type_index type) const { - if (typeid(T).hash_code() == type.hash_code()) { - return sizeof(T); - } else { - return 0UL; - } - } -}; - -template <> -struct SizeOfTypeFunctor<> { - size_t operator()(std::type_index type) const { return 0UL; } -}; - -template -struct SizeOfTypeFunctor { - size_t operator()(std::type_index type) const { - SizeOfTypeFunctor head; - size_t head_size = head(type); - if (head_size != 0) { - return head_size; - } - SizeOfTypeFunctor tail; - return tail(type); - } -}; - -static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor - functor; - size_t size = functor(type); - - PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); - return size; -} class LoDTensor; -class Tensor { +class Tensor : public TensorBase { public: - Tensor() : offset_(0) {} + Tensor() {} template - Tensor(std::vector input, DDim ddim) : offset_(0) { + Tensor(std::vector input, DDim ddim) { PADDLE_MOBILE_ENFORCE( input.size() == framework::product(ddim), "input vector'length should be equal to tensor's length"); + auto 
input_ptr = mutable_data(ddim); for (int i = 0; i < input.size(); ++i) { input_ptr[i] = input[i]; @@ -95,44 +54,19 @@ class Tensor { this->offset_ = inTensor.offset_; } - /*! Return a pointer to mutable memory block. */ - template - inline T *data() { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s", - this->holder_->type().name()); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - /*! Return a pointer to constant memory block. */ - template - inline const T *data() const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s ,requested:%s", - this->holder_->type().name(), typeid(T).name()); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); + /*! Resize the dimensions of the memory block. */ + inline Tensor &Resize(const DDim &dims) { + dims_ = dims; + return *this; } - inline bool IsInitialized() const { return holder_ != nullptr; } - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T *mutable_data() { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(typeid(T))); + /*! The internal of two tensors share the same memory block. */ + inline Tensor &ShareDataWith(const Tensor &src) { + src.check_memory_size(); + if (holder_.get() != src.holder_.get()) { + *this = src; + } + return *this; } inline void *mutable_data(std::type_index type) { @@ -149,6 +83,16 @@ class Tensor { reinterpret_cast(holder_->ptr()) + offset_); } + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. 
+ */ + template + inline T *mutable_data() { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(typeid(T))); + } + /** * @brief Return a pointer to mutable memory block. * @@ -164,27 +108,6 @@ class Tensor { return mutable_data(); } - /*! Return the dimensions of the memory block. */ - inline const DDim &dims() const { return dims_; } - - /*! Return the numel of the memory block. */ - inline int64_t numel() const { return product(dims_); } - - /*! Resize the dimensions of the memory block. */ - inline Tensor &Resize(const DDim &dims) { - dims_ = dims; - return *this; - } - - /*! The internal of two tensors share the same memory block. */ - inline Tensor &ShareDataWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get()) { - *this = src; - } - return *this; - } - /** * @brief Return a sub-tensor of the given tensor. * @@ -218,44 +141,35 @@ class Tensor { } } - std::type_index type() const { + /*! Return a pointer to mutable memory block. */ + template + inline T *data() { + check_memory_size(); PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor not initialized yet when Tensor::type() is called.") - return holder_->type(); - } + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); - // memory size returns the holding memory size in byte. - size_t memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); } - inline void check_memory_size() const { + /*! Return a pointer to constant memory block. */ + template + inline const T *data() const { + check_memory_size(); PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), - "Tensor's dims_ is out of bound. 
"); + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s ,requested:%s", + this->holder_->type().name(), typeid(T).name()); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); } private: - /** - * @note Placeholder hides type T, so it doesn't appear as a - * template - * parameter of Variable. - */ - struct Placeholder { - virtual ~Placeholder() = default; - - virtual void *ptr() const = 0; - - virtual size_t size() const = 0; - - virtual std::type_index type() const = 0; - - virtual void set_type(std::type_index type) = 0; - }; - struct PlaceholderImpl : public Placeholder { PlaceholderImpl(size_t size, std::type_index type) : ptr_(static_cast(memory::Alloc(size)), @@ -283,27 +197,6 @@ class Tensor { std::type_index type_; }; - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /** - * @brief points to elements dimensions. - * - * @note dims_ do not indicate the memory block size. - */ - - DDim dims_; - - /** - * @brief A PlaceHolder may be shared by more than one tensor. - * - * @note Some of them may be slices of the others. So the offset_ - * is introduced here to indicate the byte offset between - * PlaceHolder::ptr_ and where the tensor data really - * begins. - */ - size_t offset_; - #ifdef PADDLE_MOBILE_FPGA public: // NOLINT inline void reset_data_ptr(void *p) { diff --git a/src/framework/tensor_base.h b/src/framework/tensor_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e1539d2e681973b39eeca5b30e2ed35b535be8cb --- /dev/null +++ b/src/framework/tensor_base.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "common/enforce.h" +#include "common/types.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +template +struct SizeOfTypeFunctor; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor + functor; + size_t size = functor(type); + + PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + +class TensorBase { + public: + virtual inline TensorBase &Resize(const DDim &dims) = 0; + + inline bool IsInitialized() const { return holder_ != nullptr; } + + /*! Return the dimensions of the memory block. */ + inline const DDim &dims() const { return dims_; } + + /*! Return the numel of the memory block. 
*/ + inline int64_t numel() const { return product(dims_); } + + std::type_index type() const { + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor not initialized yet when Tensor::type() is called.") + return holder_->type(); + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const { + return holder_ == nullptr ? 0UL : holder_->size() - offset_; + } + + inline void check_memory_size() const { + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), + "Tensor's dims_ is out of bound. "); + } + + protected: + /** + * @note Placeholder hides type T, so it doesn't appear as a + * template + * parameter of Variable. + */ + struct Placeholder { + virtual ~Placeholder() = default; + + virtual void *ptr() const = 0; + + virtual size_t size() const = 0; + + virtual std::type_index type() const = 0; + + virtual void set_type(std::type_index type) = 0; + }; + + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really + * begins. 
+ */ + size_t offset_ = 0; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 6a7dff597af7fa5de06c90304136e81390fe06af..8088f0b8c9f600ce2422af500ab66a68e1341fc8 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -29,7 +29,9 @@ PaddleMobilePredictor::PaddleMobilePredictor( template bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { paddle_mobile_.reset(new PaddleMobile()); - +#ifdef PADDLE_MOBILE_CL + paddle_mobile_->SetCLPath(config.cl_path); +#endif if (config.memory_pack.from_memory) { DLOG << "load from memory!"; paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size, @@ -50,7 +52,6 @@ bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { paddle_mobile_->SetThreadNum(config.thread_num); return true; } - template bool PaddleMobilePredictor::Run( const std::vector &inputs, @@ -126,6 +127,8 @@ CreatePaddlePredictor( x.reset(new PaddleMobilePredictor(config)); } else if (config.device == PaddleMobileConfig::kGPU_MALI) { x.reset(new PaddleMobilePredictor(config)); + } else if (config.device == PaddleMobileConfig::kGPU_CL) { + x.reset(new PaddleMobilePredictor(config)); } else { LOG(kLOG_ERROR) << "unsupport device type!"; return nullptr; diff --git a/src/ios_io/PaddleMobileCPU.h b/src/io/ios_io/PaddleMobileCPU.h similarity index 100% rename from src/ios_io/PaddleMobileCPU.h rename to src/io/ios_io/PaddleMobileCPU.h diff --git a/src/ios_io/PaddleMobileCPU.mm b/src/io/ios_io/PaddleMobileCPU.mm similarity index 100% rename from src/ios_io/PaddleMobileCPU.mm rename to src/io/ios_io/PaddleMobileCPU.mm diff --git a/src/jni/PML.java b/src/io/jni/PML.java similarity index 100% rename from src/jni/PML.java rename to src/io/jni/PML.java diff --git a/src/jni/paddle_mobile_jni.cpp b/src/io/jni/paddle_mobile_jni.cpp similarity index 100% rename from src/jni/paddle_mobile_jni.cpp rename to src/io/jni/paddle_mobile_jni.cpp 
diff --git a/src/jni/paddle_mobile_jni.h b/src/io/jni/paddle_mobile_jni.h similarity index 100% rename from src/jni/paddle_mobile_jni.h rename to src/io/jni/paddle_mobile_jni.h diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 16756a61bf3265a0b6d7c2ec731d2c3d17bf9c3c..5326f864a4b5238c8498ee1fe9e5810ca0a657cf 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -44,7 +44,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -98,7 +98,6 @@ class PaddlePredictor { virtual bool Run(const std::vector& inputs, std::vector* output_data, int batch_size = -1) = 0; - // Destroy the Predictor. virtual ~PaddlePredictor() = default; @@ -121,7 +120,7 @@ struct PaddleModelMemoryPack { struct PaddleMobileConfig : public PaddlePredictor::Config { enum Precision { FP32 = 0 }; - enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; + enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 }; enum Precision precision; enum Device device; @@ -132,6 +131,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { int thread_num = 1; std::string prog_file; std::string param_file; + std::string cl_path; struct PaddleModelMemoryPack memory_pack; }; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index ec1fd1af45319192585f60fa1f90500fa2deaf46..8e4a72dcadf1bc5105e1fc5f9b8c96bfb6d9aa3d 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -13,7 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "io/paddle_mobile.h" - +#ifdef PADDLE_MOBILE_CL +#include +#include "framework/cl/cl_tensor.h" +#endif +#include "common/common.h" +#include "operators/math/gemm.h" namespace paddle_mobile { template @@ -28,13 +33,13 @@ bool PaddleMobile::Load(const std::string &dirname, bool optimize, bool quantification, int batch_size, bool loddable) { if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->Load(dirname, optimize, quantification), batch_size, optimize, loddable); } else { @@ -50,13 +55,13 @@ bool PaddleMobile::Load(const std::string &model_path, bool quantification, int batch_size, bool loddable) { if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->Load(model_path, para_path, optimize, quantification), batch_size, optimize, loddable); } else { @@ -67,21 +72,22 @@ bool PaddleMobile::Load(const std::string &model_path, } template -bool PaddleMobile::LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf) { +bool PaddleMobile::LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, + size_t combined_params_len, + uint8_t *combined_params_buf) { int batch_size = 1; bool optimise = true; bool quantification = false; if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, combined_params_buf, optimise, quantification), @@ 
-117,6 +123,40 @@ void PaddleMobile::Clear() { loader_ = nullptr; } +template +double PaddleMobile::GetPredictTime() { + int m = 32; + int n = 224 * 224; + int k = 27; + int lda = k; + int ldb = n; + int ldc = n; + float *a = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * k)); + float *b = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * k * n)); + float *c = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); + int t1 = 1; + int t2 = 1; + for (int i = 0; i < m * k; ++i) { + a[i] = t1 + rand() % t2; + } + for (int i = 0; i < k * n; ++i) { + b[i] = t1 + rand() % t2; + } + paddle_mobile::operators::math::Gemm gemm; + auto time1 = paddle_mobile::time(); + // gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, + // static_cast(0), c, ldc, false, nullptr); + auto time2 = paddle_mobile::time(); + double cost = paddle_mobile::time_diff(time1, time2); + paddle_mobile::memory::Free(a); + paddle_mobile::memory::Free(b); + paddle_mobile::memory::Free(c); + return cost; +} + template PaddleMobile::~PaddleMobile() { executor_ = nullptr; @@ -157,8 +197,223 @@ void PaddleMobile::Predict_To(int end) { } #endif +#ifdef PADDLE_MOBILE_CL +static std::mutex lc; +template +void PaddleMobile::SetCLPath(std::string path) { + std::lock_guard lock(lc); + if (framework::CLEngine::Instance()->GetCLPath() == "") { + framework::CLEngine::Instance()->setClPath(path); + } +} +template <> +double PaddleMobile::GetPredictTime() { + cl_int status; + cl_uint nPlatform; + clGetPlatformIDs(0, NULL, &nPlatform); + cl_platform_id *listPlatform = + (cl_platform_id *)malloc(nPlatform * sizeof(cl_platform_id)); + clGetPlatformIDs(nPlatform, listPlatform, NULL); + cl_uint nDevice = 0; + clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); + cl_device_id *listDevice = + (cl_device_id *)malloc(nDevice * sizeof(cl_device_id)); + clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice, + NULL); + cl_context context = + 
clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); + cl_command_queue queue = + clCreateCommandQueue(context, listDevice[0], 0, &status); + + int n = 1; + int c = 3; + int h = 224; + int w = 224; + float *input = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * 3 * 224 * 224)); + float *filter = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * 32 * 27)); + int input_w = w * (c + 3) / 4; + int input_h = n * h; + int filter_w = 3 * (3 + 3) / 4; + int filter_h = 32 * 3; + int output_w = 224 * (32 + 3) / 4; + int output_h = 1 * 224; + + framework::DDim input_dims = {1, 3, 224, 224}; + framework::CLTensor input_cl_tensor(context, queue); + input_cl_tensor.Resize(input_dims); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input); + + framework::DDim filter_dims = {32, 3, 3, 3}; + framework::CLTensor filter_cl_tensor(context, queue); + input_cl_tensor.Resize(filter_dims); + cl_mem filterBuffer = filter_cl_tensor.mutable_with_data(filter); + + cl_mem cl_filter_image = NULL; + cl_mem cl_input_image = NULL; + cl_mem cl_output_image = NULL; + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; + cl_input_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, input_w, + input_h, 0, NULL, &status); + cl_filter_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, + filter_w, filter_h, 0, NULL, &status); + cl_output_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, + output_w, output_h, 0, NULL, &status); + char *code; + std::string path = framework::CLEngine::Instance()->GetCLPath() + + "/cl_kernel/feed_kernel.cl"; + size_t length = readText(path.c_str(), &code); + cl_program program = clCreateProgramWithSource( + context, 1, (const char **)&code, &length, NULL); + std::string path1 = "-cl-fast-relaxed-math -I " + + framework::CLEngine::Instance()->GetCLPath() + + "/cl_kernel"; + clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); + cl_kernel kernel = 
clCreateKernel(program, "feed", &status); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + CL_CHECK_ERRORS(status); + + size_t global_work_size[2] = {input_w, input_h}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + CL_CHECK_ERRORS(status); + + size_t global_work_size1[2] = {filter_w, filter_h}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + clFinish(queue); + queue = clCreateCommandQueue(context, listDevice[0], 0, &status); + + path = framework::CLEngine::Instance()->GetCLPath() + + "/cl_kernel/conv_kernel.cl"; + size_t length1 = readText(path.c_str(), &code); + program = clCreateProgramWithSource(context, 1, (const char **)&code, + &length1, &status); + CL_CHECK_ERRORS(status); + clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); + kernel = clCreateKernel(program, "conv_3x3", &status); + CL_CHECK_ERRORS(status); + + int c_block = (32 + 3) / 4; + int nh = n * h; + int 
stride = 1; + int offset = 0; + int input_c = (c + 3) / 4; + int dilation = 1; + int input_width = 224; + int input_height = 224; + int output_width = 224; + int output_height = 224; + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &cl_filter_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &cl_output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + size_t global_work_size2[3] = {8, 224, 224}; + auto time1 = paddle_mobile::time(); + status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + clFinish(queue); + auto time2 = paddle_mobile::time(); + paddle_mobile::memory::Free(input); + paddle_mobile::memory::Free(filter); + return paddle_mobile::time_diff(time1, time2); +} +template +int 
PaddleMobile::readText( + const char *kernelPath, + char **pcode) // 读取文本文件放入 pcode,返回字符串长度 +{ + FILE *fp; + int size; + // printf(" File: %s\n", kernelPath); + fp = fopen(kernelPath, "rb"); + if (!fp) { + printf(" Open file failed\n"); + return -1; + } + if (fseek(fp, 0, SEEK_END) != 0) { + printf(" Seek end of file failed\n"); + return -1; + } + if ((size = ftell(fp)) < 0) { + printf(" Get file position failed\n"); + return -1; + } + rewind(fp); + if ((*pcode = (char *)malloc(size + 1)) == NULL) { + printf(" Allocate space failed\n"); + return -1; + } + fread(*pcode, 1, size, fp); + (*pcode)[size] = '\0'; + fclose(fp); + return size + 1; +} + +#endif + template class PaddleMobile; template class PaddleMobile; template class PaddleMobile; +template class PaddleMobile; + } // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index e0ff51d246b179e3f91e1c94f3b26c5ff9ba3d8f..ab148e7361c160bc658403d4696b806323595c54 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -22,10 +22,13 @@ limitations under the License. */ #endif // _OPENMP #include "common/types.h" +#include "framework/executor.h" #include "framework/load_ops.h" +#include "framework/loader.h" #include "framework/tensor.h" -#include "io/executor.h" -#include "io/loader.h" +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_engine.h" +#endif namespace paddle_mobile { @@ -34,7 +37,13 @@ class PaddleMobile { typedef typename PrecisionTrait

::ptype Ptype; public: - PaddleMobile() {} + PaddleMobile() { +#ifndef PADDLE_MOBILE_CL + bool is_gpu = std::is_same, Dtype>::value; + PADDLE_MOBILE_ENFORCE(!is_gpu, + "Not Enable GPU in CmakeList but run gpu codes "); +#endif + } bool Load(const std::string &dirname, bool optimize = false, bool quantification = false, int batch_size = 1, bool loddable = false); @@ -52,10 +61,11 @@ class PaddleMobile { bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf); + uint8_t *combined_params_buf); void SetThreadNum(int num); void Clear(); + double GetPredictTime(); ~PaddleMobile(); @@ -68,9 +78,16 @@ class PaddleMobile { void Predict_To(int end); #endif +#ifdef PADDLE_MOBILE_CL + public: + void SetCLPath(std::string cl_path); + int readText(const char *kernelPath, + char **pcode); // 读取文本文件放入 pcode,返回字符串长度 +#endif + private: - std::shared_ptr> loader_; - std::shared_ptr> executor_; + std::shared_ptr> loader_; + std::shared_ptr> executor_; }; } // namespace paddle_mobile diff --git a/src/io/paddle_test_inference_api.cpp b/src/io/paddle_test_inference_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97410ff32e31298bfd35abcc7dfc8cef61fe017a --- /dev/null +++ b/src/io/paddle_test_inference_api.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "io/paddle_test_inference_api.h" +#include "io/paddle_mobile.h" +namespace paddle_mobile { +template +double PaddleTester::CaculatePredictTime(std::string *cl_path) { + PaddleMobile paddle_mobile; +#ifdef PADDLE_MOBILE_CL + if (cl_path) { + paddle_mobile.SetCLPath(*cl_path); + } + +#endif + return paddle_mobile.GetPredictTime(); +} +template class PaddleTester; +template class PaddleTester; +template class PaddleTester; + +template class PaddleTester; + +} // namespace paddle_mobile diff --git a/src/io/paddle_test_inference_api.h b/src/io/paddle_test_inference_api.h new file mode 100644 index 0000000000000000000000000000000000000000..b203bac43d17cafd7655911df5a5116b215413bd --- /dev/null +++ b/src/io/paddle_test_inference_api.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * might release another API. 
+ */ + +#pragma once +#include "common/types.h" +#include "string" +namespace paddle_mobile { +template +class PaddleTester { + public: + double CaculatePredictTime(std::string *cl_path = nullptr); +}; + +} // namespace paddle_mobile diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9..2fb74d18809f174810866a990396bb0279d256f5 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -16,10 +16,12 @@ limitations under the License. */ #include #include -#ifdef PADDLE_MOBILE_FPGA - -#include "fpga/api.h" +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" #endif namespace paddle_mobile { @@ -30,7 +32,7 @@ const int MALLOC_ALIGN = 64; namespace fpga = paddle_mobile::fpga; void Copy(void *dst, const void *src, size_t num) { - std::memcpy(dst, src, num); + fpga::fpga_copy(dst, src, num); } void *Alloc(size_t size) { return fpga::fpga_malloc(size); } diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp index f820908404ea637d9680c32d5c4b5568e191dd7e..89220dd2489c93a84bc8a141c06a151b8044a4e4 100644 --- a/src/operators/batchnorm_op.cpp +++ b/src/operators/batchnorm_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef BATCHNORM_OP -#include "batchnorm_op.h" +#include "operators/batchnorm_op.h" #include "framework/op_proto_maker.h" #include "framework/op_registry.h" @@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp); #ifdef PADDLE_MOBILE_FPGA #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp); +#endif + #endif diff --git a/src/operators/bilinear_interp_op.h b/src/operators/bilinear_interp_op.h index 1b17406c546d336fd42b0a818d16627c87aedb09..2bb61d129d5ba45900f1c67b8c202e958a004bb7 100644 --- a/src/operators/bilinear_interp_op.h +++ b/src/operators/bilinear_interp_op.h @@ -40,10 +40,6 @@ class BilinearOp : public framework::OperatorWithKernel< DeviceType, BilinearInterpParam, operators::BilinearInterpKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, BilinearInterpParam, - operators::BilinearInterpKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index c06ca8265dd495acb79e4e2ec6c497941b822b21..3a3048c6624996892333a71773c33ee2f6e18e0a 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -39,10 +39,6 @@ class BoxCoderOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::BoxCoderKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, BoxCoderParam, - operators::BoxCoderKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index eb257d47228ab854c00574a001f6454e239cfbbd..a01e066edd1082bc109ba7eb0f31a2ac42ab865a 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -34,10 +34,6 @@ class ConcatOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ConcatKernel>( type, inputs, outputs, attrs, scope) {} - - using 
framework::OperatorWithKernel< - DeviceType, ConcatParam, - operators::ConcatKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index c4601995219b32db75f22c7c2ed959e18af85f36..2c70f42f56530c2d21252d6b51c228e7c49ca8bf 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -62,4 +62,8 @@ REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(conv2d, ops::ConvOp); +#endif + #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index 23c022e584f9be6cb0b4c2c416ca96e61b3c131f..1b8bd70805ccff8946c1ab12a207618849fc9ca4 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -34,10 +34,6 @@ class ConvOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ConvKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::ConvKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/conv_transpose_op.cpp b/src/operators/conv_transpose_op.cpp index 4d9eefaa85be51c9c2409ca044a6da4874566e1c..d09a7937453f3bd2c20d9e6bc1a03d4375d57491 100644 --- a/src/operators/conv_transpose_op.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose); #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose); #endif #endif diff --git a/src/operators/crf_op.h b/src/operators/crf_op.h index 9b7487ee958467dac451c3bcb743e6122842c7f1..dca481bb2dd08dc65fb94e41d0573277c9b143c7 100644 --- a/src/operators/crf_op.h +++ b/src/operators/crf_op.h @@ -37,10 +37,6 @@ class CrfOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::CrfKernel>( type, inputs, outputs, attrs, scope) {} - 
- using framework::OperatorWithKernel< - DeviceType, CrfParam, - operators::CrfKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 845c59a19e613bfcf299b445b778eff4d99c7295..102d65670d3e50acd15745e95b85d7b843994ed7 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -36,10 +36,6 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< DeviceType, ConvParam, operators::DepthwiseConvKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::DepthwiseConvKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h index 65f3587c2336b3e581a30328c41ad397b2848b34..ce8acd5966439808f7a03f18cf3d29a1b5c0487e 100644 --- a/src/operators/dropout_op.h +++ b/src/operators/dropout_op.h @@ -38,10 +38,6 @@ class DropoutOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::DropoutKernel>( type, inputs, outputs, attrs, scope) {} - - // using framework::OperatorWithKernel, - // operators::DropoutKernel>; void InferShape() const override; protected: diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 93e447d51f0e9ce2fdf75c60332ad52950d68c3d..281cd3d5084a1a15502e1e06865e1024d3b2b639 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef ELEMENTWISEADD_OP -#include "elementwise_add_op.h" +#include "operators/elementwise_add_op.h" namespace paddle_mobile { namespace operators { @@ -36,4 +36,12 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp); +#endif + +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp); +#endif + #endif diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index a1360eba5480a46395cedb445a4df4e4ca0ab279..a853b40ff7ccf323911f2ea1bf6e23d67d111db2 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -37,10 +37,6 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< DeviceType, ElementwiseAddParam, operators::ElementwiseAddKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, - operators::ElementwiseAddKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index 41f9e687bb4024d245a89df3dc785e1254b5a9a7..c3211b9fa9cc4b973788af4104c7ebe7bea2f54f 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -14,6 +14,19 @@ limitations under the License. 
*/ #include "operators/feed_op.h" +namespace paddle_mobile { +namespace operators { + +template +void FeedOp::InferShape() const { + auto out_dims = this->param_.Out()->dims(); + out_dims[0] = this->param_.BatchSize(); + this->param_.Out()->Resize(out_dims); +} + +} // namespace operators +} // namespace paddle_mobile + namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU @@ -25,3 +38,6 @@ REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(feed, ops::FeedOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(feed, ops::FeedOp); +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index c7e77fcca40a3c533e442d10604c8cd9bcc1e74b..57932474184fd5431e5b6ac5756ab28faa2b1b9e 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -16,68 +16,29 @@ limitations under the License. */ #include #include "framework/operator.h" +#include "operators/kernel/feed_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { +using std::string; + template -class FeedOp : public framework::OperatorBase { +class FeedOp + : public framework::OperatorWithKernel, + FeedKernel> { public: FeedOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, scope.get()) {} - - void InferShape() const { - auto out_dims = param_.Out()->dims(); - out_dims[0] = param_.BatchSize(); - param_.Out()->Resize(out_dims); - } - -#ifdef PADDLE_MOBILE_FPGA - - void Init() { - Tensor *output = param_.Out(); - fpga::format_fp16_ofm(output); - } - - void RunImpl() const { - auto input = (Tensor *)const_cast(param_.InputX()); // NOLINT - fpga::format_image(input); - auto input_ptr = input->data(); - Tensor *output = param_.Out(); - auto output_ptr = output->data(); - - fpga::BypassArgs args = 
{fpga::DATA_TYPE_FP32}; - - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = (void *)input_ptr; // NOLINT - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); - } -#else - void Init() {} - void RunImpl() const { - param_.Out()->ShareDataWith(*param_.InputX()); - param_.Out()->set_lod(param_.InputX()->lod()); - } -#endif + : framework::OperatorWithKernel, + FeedKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; protected: - FeedParam param_; }; } // namespace operators diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 6c5d1341db12db5e602bad08aaa33f26b2ac3396..50e53c30cfd06a8fae8c9e18dd4aa985a056a13e 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -13,6 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/fetch_op.h" +namespace paddle_mobile { +namespace operators { + +template +void FetchOp::InferShape() const { + auto x_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dims); +} + +} // namespace operators +} // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU @@ -24,3 +35,6 @@ REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fetch, ops::FetchOp); +#endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 9fbfc2f417b52162950612beb2979fe640cbdcc4..f92c66a05f121b3f6b78c244dd01d81393fa5c68 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "framework/operator.h" +#include "operators/kernel/fetch_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -23,25 +24,20 @@ namespace operators { using std::string; template -class FetchOp : public framework::OperatorBase { +class FetchOp + : public framework::OperatorWithKernel, + FetchKernel> { public: FetchOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + : framework::OperatorWithKernel, + FetchKernel>( + type, inputs, outputs, attrs, scope) {} - void Init() {} - - void InferShape() const { - auto x_dims = param_.InputX()->dims(); - param_.Out()->Resize(x_dims); - } + void InferShape() const override; protected: - FetchParam param_; }; } // namespace operators diff --git a/src/operators/fill_constant_op.cpp b/src/operators/fill_constant_op.cpp index 
6d7c4f44f1b769c47d6f741d139118158292a40f..0c13c57ceb53933c750f8c1adaa8b4e24ff948c8 100644 --- a/src/operators/fill_constant_op.cpp +++ b/src/operators/fill_constant_op.cpp @@ -20,9 +20,6 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp); -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp); #endif diff --git a/src/operators/fill_constant_op.h b/src/operators/fill_constant_op.h index 78eb162efc8ccd42b9fba363d49d1dbc4052f6b2..e24cecd363630a845f147e2e429b973dad24f63d 100644 --- a/src/operators/fill_constant_op.h +++ b/src/operators/fill_constant_op.h @@ -37,7 +37,7 @@ class FillConstantOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { + void RunImpl() { auto data_type = static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( param_.DataDtype()); diff --git a/src/operators/flatten_op.h b/src/operators/flatten_op.h index e935ae308cf5c28b9c435086b2b5e4d4407c319a..a7a91e60701cf559cb35238aa2966c02c869e844 100644 --- a/src/operators/flatten_op.h +++ b/src/operators/flatten_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "framework/operator.h" #include "operators/kernel/flatten_kernel.h" @@ -53,10 +54,6 @@ class FlattenOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::FlattenKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FlattenParam, - operators::FlattenKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h index 7893ff95a671447adbeebeeaf4096235e7a37964..4ec76b500812f95eb64e27564d0e63b2c1b2c2d3 100644 --- a/src/operators/fusion_conv_add_add_prelu_op.h +++ b/src/operators/fusion_conv_add_add_prelu_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" @@ -67,10 +68,6 @@ class FusionConvAddAddPReluOp DeviceType, FusionConvAddAddPReluParam, operators::ConvAddAddPReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddAddPReluParam, - operators::ConvAddAddPReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index e7d6ee59f2dadbdca0af72af1e786f0430c58d63..b9bc948fe0e77741a36f959e29eb2a4c82e82b72 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -58,5 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif - +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); +#endif #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h 
b/src/operators/fusion_conv_add_bn_relu_op.h index 07bb0146b3f481e09d0a944c4791237e7eea08e4..6ecc9bdc4a90530221c70651c52457874e3eaaa8 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.h +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -66,10 +66,6 @@ class FusionConvAddBNReluOp DeviceType, FusionConvAddBNReluParam, operators::ConvAddBNReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, - operators::ConvAddBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_op.cpp b/src/operators/fusion_conv_add_op.cpp index 485ba1be9baee2034dbd5c47f64372b701026e44..1b32ec39b65f8b16fd8967be3f45f4b31db5ca16 100644 --- a/src/operators/fusion_conv_add_op.cpp +++ b/src/operators/fusion_conv_add_op.cpp @@ -58,4 +58,8 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp); +#endif + #endif diff --git a/src/operators/fusion_conv_add_op.h b/src/operators/fusion_conv_add_op.h index 365e3afa97c2c2fd82c629302f8a5fddf8abb406..eef143ce8716ce856784bb01dd3d58a26746b4e8 100644 --- a/src/operators/fusion_conv_add_op.h +++ b/src/operators/fusion_conv_add_op.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_add_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -56,10 +56,6 @@ class FusionConvAddOp : public framework::OperatorWithKernel< FusionConvAddParam, operators::ConvAddKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddParam, - operators::ConvAddKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h index 0b0763e781daf3d882d0463205b07fdef53b90f5..fc1143099e16b8b7f7c44d7fe5a5694a278a1906 100644 --- a/src/operators/fusion_conv_add_prelu_op.h +++ b/src/operators/fusion_conv_add_prelu_op.h @@ -39,10 +39,7 @@ class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher { std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}} - - }, - + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; } @@ -63,9 +60,6 @@ class FusionConvAddPReluOp operators::ConvAddPReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionConvAddPReluParam, - operators::ConvAddPReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 486221f0f6b2e1b0d78d2632c8d735a6a6a101bb..bb4b6666a881de0989d43840806b9d5d720b3b66 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #ifdef PADDLE_MOBILE_FPGA 
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif - +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp); +#endif #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index 1335ce7b6ca5151e3d396856055f38825710f4b1..22ba67c617ecdb0f3be2f5757504b6ba530b092c 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -29,9 +29,8 @@ namespace operators { class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { public: FusionConvAddReluOpMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_RELU); + // node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD); + // node_ > std::make_shared(G_OP_TYPE_RELU); } void FolderNodes( @@ -57,9 +56,6 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< operators::ConvAddReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_bn_add_relu_op.h b/src/operators/fusion_conv_bn_add_relu_op.h index b2f911363acc4f9d5b3c4407317107efadf3996d..303668a89bf7869e72a4b546c5d96be24b26c4ec 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.h +++ b/src/operators/fusion_conv_bn_add_relu_op.h @@ -17,11 +17,12 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_bn_add_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -71,10 +72,6 @@ class FusionConvBNAddReluOp DeviceType, FusionConvBNAddReluParam, operators::ConvBNAddReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvBNAddReluParam, - operators::ConvBNAddReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h index a6bbe72500ccfe2b43e21496c5abc18b9a562d47..9bc534fe333c76e8f533c904560b8228760c66e5 100644 --- a/src/operators/fusion_conv_bn_relu_op.h +++ b/src/operators/fusion_conv_bn_relu_op.h @@ -63,10 +63,6 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel< DeviceType, FusionConvBNReluParam, operators::ConvBNReluKernel>(type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvBNReluParam, - operators::ConvBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_deconv_relu_op.cpp b/src/operators/fusion_deconv_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..daae39c951b34fa05962f936c28381f7d5d4e15c --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVRELU_OP + +#include "operators/fusion_deconv_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_relu_op.h b/src/operators/fusion_deconv_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e87d5d3798930d745b82c8e5a3cca793c12ee4b1 --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvReluMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), {}, removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; } +}; + +template +class FusionDeconvReluOp : public framework::OperatorWithKernel< + DeviceType, FusionDeconvReluParam, + operators::DeconvReluKernel> { + public: + FusionDeconvReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvReluParam, + operators::DeconvReluKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be 
consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_FC_RELU_OP diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h index 44a1f845bc9b2dc0251fb729de9f9c00071fd492..d7a74d896e904971e21c28fab29771b34a049921 100644 --- a/src/operators/fusion_dwconv_bn_relu_op.h +++ b/src/operators/fusion_dwconv_bn_relu_op.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/dwconv_bn_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -65,9 +65,6 @@ class FusionDWConvBNReluOp operators::DWConvBNReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionDWConvBNReluParam, - operators::DWConvBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 722c5225bc035df2761154a08a521a09b34a1e82..26cb40aac8e47203f125417e1f6b5df75d7835b5 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -56,10 +56,6 @@ class FusionFcOp : public framework::OperatorWithKernel< operators::FusionFcKernel>( type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionFcParam, - operators::FusionFcKernel>::OperatorWithKernel; - void InferShape() const override; }; diff --git a/src/operators/fusion_fc_relu_op.h b/src/operators/fusion_fc_relu_op.h index 5cd884f04e819ac881c3b2a4ad666591ea610117..7324f94138e59c4a4a93fe2658b38ddbdf6fa651 100644 --- a/src/operators/fusion_fc_relu_op.h +++ b/src/operators/fusion_fc_relu_op.h @@ -56,9 +56,6 @@ class FusionFcReluOp : public framework::OperatorWithKernel< operators::FusionFcReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionFcReluParam, - operators::FusionFcReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/gru_op.h b/src/operators/gru_op.h index a45d3efe5b4c59f8582c534f85de7cc1ac82df85..5e66b497af15c498e2af5ff5903ef88a16db1832 100644 --- a/src/operators/gru_op.h +++ b/src/operators/gru_op.h @@ -37,10 +37,6 @@ class GruOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, 
operators::GruKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, GruParam, - operators::GruKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h index 50d5664c1a3ce999a0c163225d20126961804a22..036b496ca8293432aa30ae86542e78880143f086 100644 --- a/src/operators/im2sequence_op.h +++ b/src/operators/im2sequence_op.h @@ -16,15 +16,14 @@ limitations under the License. */ #pragma once -#include +#include #include "framework/operator.h" #include "operators/kernel/im2sequence_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -using namespace framework; - template class Im2SequenceOp : public framework::OperatorWithKernel< DeviceType, Im2SequenceParam, @@ -39,9 +38,6 @@ class Im2SequenceOp : public framework::OperatorWithKernel< operators::Im2SequenceKernel>(type, inputs, outputs, attrs, scope) {} - // using framework::OperatorWithKernel< - // DeviceType, Im2SequenceParam, - // operators::Im2SequenceKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index c420727f425092240994ee834117225c72abeec2..f31c4426db7d28234692742fcd670cb26ec50ab0 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -26,8 +26,7 @@ bool BatchNormKernel::Init(BatchNormParam *param) { } template <> -void BatchNormKernel::Compute( - const BatchNormParam ¶m) const { +void BatchNormKernel::Compute(const BatchNormParam ¶m) { BatchnormCompute(param); } diff --git a/src/operators/kernel/arm/bilinear_interp_kernel.cpp b/src/operators/kernel/arm/bilinear_interp_kernel.cpp index 4888f7a37a47fe80ffcbaee7e3f80b1d5c1f20f4..85192e28edf8351bd8be540b27aa986b2c458d0d 100644 --- a/src/operators/kernel/arm/bilinear_interp_kernel.cpp +++ 
b/src/operators/kernel/arm/bilinear_interp_kernel.cpp @@ -27,7 +27,7 @@ bool BilinearInterpKernel::Init(BilinearInterpParam *param) { template <> void BilinearInterpKernel::Compute( - const BilinearInterpParam ¶m) const { + const BilinearInterpParam ¶m) { BilinearInterpCompute(param); } diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp index b769d4fbbaa7570ee741476f960d9e5b60c61917..30ede12dffe0eed7673c9ae1f7c836fd1b5b7096 100644 --- a/src/operators/kernel/arm/box_coder_kernel.cpp +++ b/src/operators/kernel/arm/box_coder_kernel.cpp @@ -26,8 +26,7 @@ bool BoxCoderKernel::Init(BoxCoderParam *param) { } template <> -void BoxCoderKernel::Compute( - const BoxCoderParam ¶m) const { +void BoxCoderKernel::Compute(const BoxCoderParam ¶m) { BoxCoderCompute(param); } diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index 04c590e6b432fbf88cd136eac942485adf9a9003..8cdf6cb01afa85239bfd0d48bbce02790ba5250d 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -26,7 +26,7 @@ bool ConcatKernel::Init(ConcatParam *param) { } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) { ConcatCompute(param); param.Out()->set_lod(param.Inputs()[0]->lod()); } diff --git a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp index 74b88f5d4f5e24b1401803c8c48d99319f412d1b..2f6f5f3ac719b3fd32aac54ce36eb534f7d99dd7 100644 --- a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp @@ -28,7 +28,7 @@ bool ConvAddAddPReluKernel::Init( template <> void ConvAddAddPReluKernel::Compute( - const FusionConvAddAddPReluParam ¶m) const { + const FusionConvAddAddPReluParam ¶m) { ConvAddAddPReluCompute(param); } template class ConvAddAddPReluKernel; diff --git 
a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp index ca53ebea8e4577fdc52fad066691d4351eaf12f9..eb55920621db34d191a9536f287ec50747e1ce3c 100644 --- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp @@ -55,7 +55,7 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) { ConvAddBNReluCompute(param); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp index 1af1c3db1159cd4fed007ebf153ba15b804eee75..e016b8efbd15472ae0d77423d84dc19671bfa316 100644 --- a/src/operators/kernel/arm/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_kernel.cpp @@ -25,8 +25,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { } template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) const { +void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { ConvAddCompute(param); } diff --git a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp index 5930cfdcfc0f983c9f07754113dc37d5122d19f0..f04a9a7d746f2d970196945707bd05409c5fa340 100644 --- a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp @@ -27,7 +27,7 @@ bool ConvAddPReluKernel::Init(FusionConvAddPReluParam *param) { template <> void ConvAddPReluKernel::Compute( - const FusionConvAddPReluParam ¶m) const { + const FusionConvAddPReluParam ¶m) { ConvAddPReluCompute(param); } template class ConvAddPReluKernel; diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp index f50e1e3900bb5fce35a29100d6c2cb6004b4af74..211d6d8487bfd4afc71d74e5ecbff149ad34e466 100644 --- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp +++ 
b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -27,7 +27,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) { ConvAddReluCompute(param); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp index 785b13dde2ec1196792d17b253bb0d904da799f5..a0f21dd6126ed81cf5e96f99bd0f8ed5211f96a4 100644 --- a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp @@ -55,7 +55,7 @@ bool ConvBNAddReluKernel::Init( template <> void ConvBNAddReluKernel::Compute( - const FusionConvBNAddReluParam ¶m) const { + const FusionConvBNAddReluParam ¶m) { ConvBNAddReluCompute(param); } template class ConvBNAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp index 6b9ea0428fa496980a234c7c895ef9cbf1245b51..d8acb8d2083b732da026a9bff19c2d7732568597 100644 --- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp @@ -57,7 +57,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { template <> void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) const { + const FusionConvBNReluParam ¶m) { ConvBNReluCompute(param); } template class ConvBNReluKernel; diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index be518d3a2cac2f3a749a7bbbbd0c15a17cf2904c..93aaea4afd7026f792a007b337a35c2bde48ad48 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -55,7 +55,7 @@ bool ConvKernel::Init(ConvParam *param) { } template <> -void ConvKernel::Compute(const ConvParam ¶m) const { +void ConvKernel::Compute(const ConvParam ¶m) { switch (param.ExecMode()) { case 
ConvParam::EXEC_GEMM_INT8: GemmConv(param); diff --git a/src/operators/kernel/arm/conv_transpose_kernel.cpp b/src/operators/kernel/arm/conv_transpose_kernel.cpp index 94f8a79101ca4b1f4085a4d172fee761714dc3d2..771a846ed65e5c69090698ce813103077dedaccf 100644 --- a/src/operators/kernel/arm/conv_transpose_kernel.cpp +++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp @@ -27,7 +27,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { template <> void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) const { + const ConvTransposeParam ¶m) { ConvTransposeCompute(param); } diff --git a/src/operators/kernel/arm/crf_kernel.cpp b/src/operators/kernel/arm/crf_kernel.cpp index 89769c50a6fc05b28192ebf584ba3cb12f19ac2c..d30c28b3576e2a8a8a108ae6c86edc2f4310b83f 100644 --- a/src/operators/kernel/arm/crf_kernel.cpp +++ b/src/operators/kernel/arm/crf_kernel.cpp @@ -27,7 +27,7 @@ bool CrfKernel::Init(CrfParam *param) { } template <> -void CrfKernel::Compute(const CrfParam ¶m) const { +void CrfKernel::Compute(const CrfParam ¶m) { CrfCompute(param); } diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index fd5e068afb6f7f2a069a7d8fccc459d4c2a6828d..000d59baa8c804201cbd2e2a731c2077196b698f 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -26,8 +26,7 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { } template <> -void DepthwiseConvKernel::Compute( - const ConvParam ¶m) const { +void DepthwiseConvKernel::Compute(const ConvParam ¶m) { DepthwiseConvCompute(param); } diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp index 64da460da1b90bcc9b16500b9562e270a4110f78..03122047f61c585c3955ca18243ab849fb498728 100644 --- a/src/operators/kernel/arm/dequantize_kernel.cpp +++ b/src/operators/kernel/arm/dequantize_kernel.cpp @@ -29,8 +29,7 @@ bool 
DequantizeKernel::Init(DequantizeParam *param) { } template <> -void DequantizeKernel::Compute( - const DequantizeParam ¶m) const { +void DequantizeKernel::Compute(const DequantizeParam ¶m) { const Tensor *input = param.input_; Tensor *output = param.out_; float activation_scale = param.activation_scale_->data()[0]; diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index 4578ac6607d87c316853f6201f02f8204bc41de1..964773ad696ea53fccec62a394f00fa70daf7145 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -27,7 +27,7 @@ bool DropoutKernel::Init(DropoutParam *para) { template struct DropoutFunctor { - DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} + explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} inline T operator()(T in) const { return (1 - dropout_pro_) * in; } private: @@ -35,7 +35,7 @@ struct DropoutFunctor { }; template <> -void DropoutKernel::Compute(const DropoutParam ¶m) const { +void DropoutKernel::Compute(const DropoutParam ¶m) { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp index b85701bb936b2ccc0323e4d534424abb726a69be..f92d9a273467bf15d9d7fad43237af5385d3d54e 100644 --- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp @@ -54,7 +54,7 @@ bool DWConvBNReluKernel::Init(FusionDWConvBNReluParam *param) { template <> void DWConvBNReluKernel::Compute( - const FusionDWConvBNReluParam ¶m) const { + const FusionDWConvBNReluParam ¶m) { DWConvBNReluCompute(param); } template class DWConvBNReluKernel; diff --git a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp index 9c6f4a3316385b803a8fdb833490f1fe9e7f41ac..043d27e72f16ab4b38f31d6cff60bd2f4e89a649 
100644 --- a/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) { ElementwiseAddCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/elementwise_mul_kernel.cpp b/src/operators/kernel/arm/elementwise_mul_kernel.cpp index 00205952a2567aae5927e318c494c90bc4a5ffbb..9c245707da31d07e2419439c68343f7014beb416 100644 --- a/src/operators/kernel/arm/elementwise_mul_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_mul_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { template <> void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) const { + const ElementwiseMulParam ¶m) { ElementwiseMulCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/elementwise_sub_kernel.cpp b/src/operators/kernel/arm/elementwise_sub_kernel.cpp index d78b3e31098ef7ef929a0d2c00043fab7193b01c..30f607155c4a91f4f523c6596f09c2379970108c 100644 --- a/src/operators/kernel/arm/elementwise_sub_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_sub_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseSubKernel::Init(ElementwiseSubParam *param) { template <> void ElementwiseSubKernel::Compute( - const ElementwiseSubParam ¶m) const { + const ElementwiseSubParam ¶m) { ElementwiseSubCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/feed_kernel.cpp b/src/operators/kernel/arm/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..598f6df01b16683f4d6e06f6418a2930a7ec8736 --- /dev/null +++ b/src/operators/kernel/arm/feed_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/fetch_kernel.cpp b/src/operators/kernel/arm/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c25514857dee9029afa3a7a80d5c89a97bbe9be --- /dev/null +++ b/src/operators/kernel/arm/fetch_kernel.cpp @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "operators/kernel/fetch_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} +template class FetchKernel; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/flatten_kernel.cpp b/src/operators/kernel/arm/flatten_kernel.cpp index ef4fe913c4800526f46daa75760afe82fdbee591..4d00e494544557ce05f2af16bb59979ea2b8927f 100644 --- a/src/operators/kernel/arm/flatten_kernel.cpp +++ b/src/operators/kernel/arm/flatten_kernel.cpp @@ -26,7 +26,7 @@ bool FlattenKernel::Init(FlattenParam *param) { } template <> -void FlattenKernel::Compute(const FlattenParam ¶m) const { +void FlattenKernel::Compute(const FlattenParam ¶m) { FlattenCompute(param); } diff --git a/src/operators/kernel/arm/fusion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp index d9d112e7a762705efe041c74eea9ddb7d5162918..c503edab643def7af0585a18d774b14ca0a3c39d 100644 --- a/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -26,8 +26,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { } template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { +void FusionFcKernel::Compute(const FusionFcParam ¶m) { FusionFcCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/gru_kernel.cpp b/src/operators/kernel/arm/gru_kernel.cpp index 168471185e07a9c1814c708238996a82c1ee0891..a4e89ff42a3d70c0a9a3d1bd7316e18d015a0926 100644 --- a/src/operators/kernel/arm/gru_kernel.cpp +++ b/src/operators/kernel/arm/gru_kernel.cpp @@ -26,7 +26,7 @@ bool GruKernel::Init(GruParam *param) { } template <> -void GruKernel::Compute(const GruParam ¶m) const { +void GruKernel::Compute(const GruParam ¶m) { GruCompute(param); param.OutHidden()->set_lod(param.InputInput()->lod()); 
// DLOG << "________________" << param.OutHidden()->dims(); diff --git a/src/operators/kernel/arm/im2sequence_kernel.cpp b/src/operators/kernel/arm/im2sequence_kernel.cpp index 8295fd94a31db2ad1c10d32a8c639b067e422f45..07ce0314fa08467d4fc63bc0745a49b8a3b2f263 100644 --- a/src/operators/kernel/arm/im2sequence_kernel.cpp +++ b/src/operators/kernel/arm/im2sequence_kernel.cpp @@ -33,9 +33,9 @@ inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, template <> void Im2SequenceKernel::Compute( - const Im2SequenceParam ¶m) const { + const Im2SequenceParam ¶m) { const Tensor *in_x = param.Input(); - Tensor *out = param.Output(); + framework::LoDTensor *out = param.Output(); out->mutable_data(); std::vector kernels = param.Kernels(); @@ -52,22 +52,31 @@ void Im2SequenceKernel::Compute( paddings[2], strides[0]); int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); - const std::vector dilations({1, 1}); - // TODO: verify + out->mutable_data({batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}); + const std::vector dilations({1, 1}); + // TODO(): verify auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); - for (int i = 0; i < batch_size; i++) { const Tensor src = in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); Tensor dst = out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, kernels[0], kernels[1]}); - math::Im2ColFunctor f; f(src, dilations, strides, paddings, &dst); } out->Resize(out_dims); + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + int offset = 0; + lod[0].push_back(offset); + for (int i = 0; i < batch_size; ++i) { + offset += output_height * output_width; + lod[0].push_back(offset); + } + out->set_lod(lod); } template class Im2SequenceKernel; diff --git a/src/operators/kernel/arm/lookup_kernel.cpp b/src/operators/kernel/arm/lookup_kernel.cpp index 
584c497c701bd0598e0a151774fe60b7c7fee718..0e6df6ab6bf19f67b0c5f5a873d4a47215167e45 100644 --- a/src/operators/kernel/arm/lookup_kernel.cpp +++ b/src/operators/kernel/arm/lookup_kernel.cpp @@ -25,7 +25,7 @@ bool LookupKernel::Init(LookupParam *param) { } template <> -void LookupKernel::Compute(const LookupParam ¶m) const { +void LookupKernel::Compute(const LookupParam ¶m) { LookupCompute(param); param.Out()->set_lod(param.InputIds()->lod()); } diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 3ec1bdd9a0e2ebbce555eef944fe56750505430f..bf049053fc5b9157f24c50233742eea3c0ca2de1 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -26,7 +26,7 @@ bool LrnKernel::Init(LrnParam *param) { } template <> -void LrnKernel::Compute(const LrnParam ¶m) const { +void LrnKernel::Compute(const LrnParam ¶m) { LrnCompute(param); } diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index 276281f963e449af9d55f7c5ca58ef5da17e6f93..59d16600d71d247c42bb7625a3dddd5952a33705 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -26,7 +26,7 @@ bool MulKernel::Init(MulParam *param) { } template <> -void MulKernel::Compute(const MulParam ¶m) const { +void MulKernel::Compute(const MulParam ¶m) { MulCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 938f81cf485eb64f408c0fb274eeec673349e306..61638da0051c7b27b695752c445f0fd6b20114b5 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -27,7 +27,7 @@ bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) const { + const MultiClassNMSParam ¶m) { MultiClassNMSCompute(param); } diff 
--git a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp index e72c29135e9898d3b5342d1c4b4f0176f105a62a..1ae11aba41f1b2dbd9207e0808990a262bb80f56 100644 --- a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp +++ b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp @@ -28,7 +28,7 @@ bool PolygonBoxTransformKernel::Init( template <> void PolygonBoxTransformKernel::Compute( - const PolygonBoxTransformParam ¶m) const { + const PolygonBoxTransformParam ¶m) { PolygonBoxTransformCompute(param); } diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 60d6f1401876b957649d08889218b88cf1fe5eef..58d6359efa48b0db215269a631e7e4cb57c429d9 100644 --- a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -25,7 +25,7 @@ bool PoolKernel::Init(PoolParam *param) { } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { +void PoolKernel::Compute(const PoolParam ¶m) { PoolCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/prelu_kernel.cpp b/src/operators/kernel/arm/prelu_kernel.cpp index e1ec927fb13d1f4a2e600d46f65f2806448059d9..591bd644165f1a271a879073b27429d1780cbfb5 100644 --- a/src/operators/kernel/arm/prelu_kernel.cpp +++ b/src/operators/kernel/arm/prelu_kernel.cpp @@ -35,7 +35,7 @@ struct PReluFunctor { * @b 特化到具体平台的实现, param 从 op 层传入 * */ template <> -void PReluKernel::Compute(const PReluParam ¶m) const { +void PReluKernel::Compute(const PReluParam ¶m) { auto *x = param.InputX(); auto *alpha = param.InputAlpha(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp index 71011fa2112f36d573b5bdc55f1b5bf92318c448..c067d3388dd928b032178add99c6567a8add20d3 100644 --- a/src/operators/kernel/arm/prior_box_kernel.cpp +++ b/src/operators/kernel/arm/prior_box_kernel.cpp @@ -26,8 +26,7 @@ bool 
PriorBoxKernel::Init(PriorBoxParam *param) { } template <> -void PriorBoxKernel::Compute( - const PriorBoxParam ¶m) const { +void PriorBoxKernel::Compute(const PriorBoxParam ¶m) { PriorBoxCompute(param); } diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index 11a1f0a53d4886e1a07d258b76b3827671471dca..17f442abe4e03d936eb3b317d5b6f164ac0924e7 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -279,8 +279,7 @@ bool QuantizeKernel::Init(QuantizeParam *param) { } template <> -void QuantizeKernel::Compute( - const QuantizeParam ¶m) const { +void QuantizeKernel::Compute(const QuantizeParam ¶m) { float max_abs = 0.f; const Tensor *input = param.input_; Tensor *output = param.out_; diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 6e04e6013aa8dd5c50dcc22a720b470b08ecd648..8ee103484eb753913e5554b64d6dac523066322a 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -26,7 +26,7 @@ bool ReluKernel::Init(ReluParam *param) { } template <> -void ReluKernel::Compute(const ReluParam ¶m) const { +void ReluKernel::Compute(const ReluParam ¶m) { ReluCompute(param); } diff --git a/src/operators/kernel/arm/reshape2_kernel.cpp b/src/operators/kernel/arm/reshape2_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..093105f906da2287015417ec05b709aebd4a1fb2 --- /dev/null +++ b/src/operators/kernel/arm/reshape2_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE2_OP + +#include "operators/kernel/reshape2_kernel.h" +#include "operators/kernel/central-arm-func/reshape2_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Reshape2Kernel::Init(Reshape2Param *param) { + return true; +} + +template <> +void Reshape2Kernel::Compute(const Reshape2Param ¶m) { + Reshape2Compute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 235288ae13e2c557e6f7310727f5d8e6e83cedf6..800808f9c23cd07d17f8207b9b51e96d3feb34f3 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -26,7 +26,7 @@ bool ReshapeKernel::Init(ReshapeParam *param) { } template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) const { +void ReshapeKernel::Compute(const ReshapeParam ¶m) { ReshapeCompute(param); } diff --git a/src/operators/kernel/arm/resize_kernel.cpp b/src/operators/kernel/arm/resize_kernel.cpp index 5c0c186554a31454447b1df47a1b7573fd948fb9..b53b7545e33c929fe0b55bccd68e7b955db0d676 100644 --- a/src/operators/kernel/arm/resize_kernel.cpp +++ b/src/operators/kernel/arm/resize_kernel.cpp @@ -108,7 +108,7 @@ void ResizeTensor(const Tensor* src, Tensor* dst) { } template <> -void ResizeKernel::Compute(const ResizeParam& param) const { +void ResizeKernel::Compute(const ResizeParam& param) { const auto* input_x = param.InputX(); const auto& input_x_dims = input_x->dims(); auto* out = param.Out(); diff --git 
a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp index 299132ea00f40838249022c45d994e7d88547eaa..bded56275f80741c552d4978bb238d6f0d6339db 100644 --- a/src/operators/kernel/arm/scale_kernel.cpp +++ b/src/operators/kernel/arm/scale_kernel.cpp @@ -23,7 +23,7 @@ namespace operators { * @b 特化到具体平台的实现, param 从 op 层传入 * */ template <> -void ScaleKernel::Compute(const ScaleParam ¶m) const { +void ScaleKernel::Compute(const ScaleParam ¶m) { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/shape_kernel.cpp b/src/operators/kernel/arm/shape_kernel.cpp index 1687cfb4cdaf12eb2be9d465a83b82034b59f7cc..4adbf8fa1321c57330b480068ff1f7df7454d7e6 100644 --- a/src/operators/kernel/arm/shape_kernel.cpp +++ b/src/operators/kernel/arm/shape_kernel.cpp @@ -26,7 +26,7 @@ bool ShapeKernel::Init(ShapeParam *param) { } template <> -void ShapeKernel::Compute(const ShapeParam ¶m) const { +void ShapeKernel::Compute(const ShapeParam ¶m) { ShapeCompute(param); } diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp index 7912fd8762b693cd40c632d6b152406ed4b0c568..3d6e14ffea80169172431229e34309cde331d588 100644 --- a/src/operators/kernel/arm/sigmoid_kernel.cpp +++ b/src/operators/kernel/arm/sigmoid_kernel.cpp @@ -32,7 +32,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { } template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) const { +void SigmoidKernel::Compute(const SigmoidParam ¶m) { SigmoidCompute(param); } diff --git a/src/operators/kernel/arm/slice_kernel.cpp b/src/operators/kernel/arm/slice_kernel.cpp index 62efec9d2fb01568a108df8f3516085d81865bf7..e373b569870c81587377ac02e578397518513a85 100644 --- a/src/operators/kernel/arm/slice_kernel.cpp +++ b/src/operators/kernel/arm/slice_kernel.cpp @@ -17,6 +17,14 @@ limitations under the License. 
*/ #include "operators/kernel/slice_kernel.h" namespace paddle_mobile { -namespace operators {} +namespace operators { + +template <> +bool SliceKernel::Init(SliceParam* param) { + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) {} +} // namespace operators } // namespace paddle_mobile #endif diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index f86a10601aa3a67300736f2f4c751c05bf41a781..d5a1009fd79d57d8815d313ed61bbc5d7bf32bbe 100644 --- a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -26,7 +26,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { } template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { SoftmaxCompute(param); } diff --git a/src/operators/kernel/arm/split_kernel.cpp b/src/operators/kernel/arm/split_kernel.cpp index d2ca34f764adc50154fb58e3a6248f9311bbface..13c7567e3db137f0c579ad0e33b1856aaf8334f2 100644 --- a/src/operators/kernel/arm/split_kernel.cpp +++ b/src/operators/kernel/arm/split_kernel.cpp @@ -26,7 +26,7 @@ bool SplitKernel::Init(SplitParam *param) { } template <> -void SplitKernel::Compute(const SplitParam ¶m) const { +void SplitKernel::Compute(const SplitParam ¶m) { SplitCompute(param); } diff --git a/src/operators/kernel/arm/sum_kernel.cpp b/src/operators/kernel/arm/sum_kernel.cpp index 0290037522a2bf3b3c88ce129eda277a401fecb5..2b36a382a1681b08e5f6c87b9031492e81a579cd 100644 --- a/src/operators/kernel/arm/sum_kernel.cpp +++ b/src/operators/kernel/arm/sum_kernel.cpp @@ -26,7 +26,7 @@ bool SumKernel::Init(SumParam *param) { } template <> -void SumKernel::Compute(const SumParam ¶m) const { +void SumKernel::Compute(const SumParam ¶m) { SumCompute(param); param.Out()->set_lod(param.Inputs()[0]->lod()); } diff --git a/src/operators/kernel/arm/transpose2_kernel.cpp b/src/operators/kernel/arm/transpose2_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..228f210ea1c52f1bfe601bd46f741347dabd6cce --- /dev/null +++ b/src/operators/kernel/arm/transpose2_kernel.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef TRANSPOSE2_OP + +#include "operators/kernel/transpose2_kernel.h" +#include "operators/kernel/central-arm-func/transpose2_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Transpose2Kernel::Init(Transpose2Param *param) { + return true; +} + +template <> +void Transpose2Kernel::Compute(const Transpose2Param ¶m) { + Transpose2Compute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index bb7a881bdc1d2706a25a77833ca38695ede2fec7..f90376eb507253badb209838a3db4bafbcfbb5b9 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -25,8 +25,7 @@ bool TransposeKernel::Init(TransposeParam *param) { } template <> -void TransposeKernel::Compute( - const TransposeParam ¶m) const { +void TransposeKernel::Compute(const TransposeParam ¶m) { TransposeCompute(param); } diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index beac7399583d074956fa4564fdd9312b2d7985f0..1f2db456d360d6eb6c684fb98e3807b07cc89b92 100644 --- 
a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -22,13 +22,11 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -using namespace framework; - template class BatchNormKernel : public framework::OpKernelBase> { public: - void Compute(const BatchNormParam ¶m) const; + void Compute(const BatchNormParam ¶m); bool Init(BatchNormParam *param); }; diff --git a/src/operators/kernel/bilinear_interp_kernel.h b/src/operators/kernel/bilinear_interp_kernel.h index ac3dfcb16190315f72dc60da54c4f944874e4458..9a68fe65a562a8567dab2e5977506e083f7889a2 100644 --- a/src/operators/kernel/bilinear_interp_kernel.h +++ b/src/operators/kernel/bilinear_interp_kernel.h @@ -29,7 +29,7 @@ class BilinearInterpKernel : public framework::OpKernelBase> { public: - void Compute(const BilinearInterpParam& param) const; + void Compute(const BilinearInterpParam& param); bool Init(BilinearInterpParam* param); }; } // namespace operators diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 58144a87349ed3a6504e0074903594be3aa6fe8f..eadb21b3d5ecb95ef82cfef2ac8c3245e925ec7c 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -29,7 +29,7 @@ template class BoxCoderKernel : public framework::OpKernelBase> { public: - void Compute(const BoxCoderParam& param) const; + void Compute(const BoxCoderParam& param); bool Init(BoxCoderParam* param); }; } // namespace operators diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h index 42c01d2825e052a52e7021a1b2a97997fb9c915b..45d5dc76d1e95668638706a252cc24d7ff2dec40 100644 --- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h +++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h @@ -29,10 +29,9 @@ void FusionFcCompute(const FusionFcParam ¶m) { auto *input_z_data = input_z->data(); int axis = 
param.Axis(); Tensor *out = param.Out(); - auto *out_data = out->mutable_data(); // int m = out->dims()[0]; // int n = out->dims()[1]; - + auto *out_data = out->mutable_data(); const Tensor x_matrix = input_x->dims().size() > 2 ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h index 37479c22efe95b6506054cf3ded5855aa766c34c..941c237865707bce854aedba56029a4f5de9b2bf 100644 --- a/src/operators/kernel/central-arm-func/pool_arm_func.h +++ b/src/operators/kernel/central-arm-func/pool_arm_func.h @@ -83,6 +83,7 @@ void PoolCompute(const PoolParam ¶m) { #if __aarch64__ PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); #else + /// todo: fix bug in Pool2x2 if (pooling_type == "max") { math::Pool2x2Maxs2p0(strides, paddings, in_x, out); } else if (pooling_type == "avg") { diff --git a/src/operators/kernel/central-arm-func/reshape2_arm_func.h b/src/operators/kernel/central-arm-func/reshape2_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..c22cf120313b039944932fb4e6cc52aa59a68fd4 --- /dev/null +++ b/src/operators/kernel/central-arm-func/reshape2_arm_func.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE2_OP +#pragma once + +#include +#include "operators/kernel/reshape_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void Reshape2Compute(const Reshape2Param ¶m) { + const auto *input_x = param.InputX(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + framework::DDim out_dims = out->dims(); + const auto *input_shape = param.InputShape(); + + if (input_shape) { + auto *shape_data = input_shape->data(); + framework::Tensor cpu_shape_tensor; + auto shape = + std::vector(shape_data, shape_data + input_shape->numel()); + out_dims = ValidateShape(shape, input_x->dims()); + } else { + auto &shape = param.Shape(); + out_dims = ValidateShape(shape, input_x_dims); + } + + bool inplace = param.Inplace(); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*input_x); + out->Resize(out_dims); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/softmax_arm_func.h b/src/operators/kernel/central-arm-func/softmax_arm_func.h index d311d97984a7207df9075befe71a9806092966e1..a94c8299c514bc9e2937daf57b1a845d7be56b16 100644 --- a/src/operators/kernel/central-arm-func/softmax_arm_func.h +++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h @@ -24,6 +24,7 @@ void SoftmaxCompute(const SoftmaxParam ¶m) { Tensor *out = param.Out(); auto x_dims = in_x->dims(); out->Resize(x_dims); + out->mutable_data(); math::SoftmaxFuntor()(in_x, out); } } // namespace operators diff --git a/src/operators/kernel/central-arm-func/transpose2_arm_func.h b/src/operators/kernel/central-arm-func/transpose2_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..dea90e863b20f19820d60d9cce67b6849d3c467b --- /dev/null +++ b/src/operators/kernel/central-arm-func/transpose2_arm_func.h @@ -0,0 +1,70 @@ 
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TRANSPOSE2_OP +#pragma once + +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void Transpose2Compute(const Transpose2Param& param) { + const auto* input_x = param.InputX(); + const auto input_x_dims = input_x->dims(); + auto* out = param.Out(); + const auto axis = param.Axis(); + const auto* input_x_data = input_x->data(); + auto* out_data = out->mutable_data(); + + size_t ndim = axis.size(); + std::vector xdim(ndim); + std::vector xstride(ndim); + std::vector xout(ndim); + for (int i = 0; i < ndim; i++) { + int j = ndim - 1 - i; + xdim[j] = input_x_dims[axis[i]]; + xstride[j] = 1; + for (int k = axis[i] + 1; k < ndim; k++) { + xstride[j] *= input_x_dims[k]; + } + xout[j] = xstride[j] * xdim[j]; + } + + auto numel = input_x->numel(); + size_t pind = 0; + std::vector ind(ndim); + for (int i = 0; i < numel; i++) { + out_data[i] = input_x_data[pind]; + ind[0]++; + pind += xstride[0]; + for (int j = 0; j < ndim - 1; j++) { + if (ind[j] == xdim[j]) { + ind[j + 1]++; + ind[j] = 0; + pind += xstride[j + 1]; + pind -= xout[j]; + } else { + break; + } + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..0d5695cb80736dcc126ce5f726c0a2566884fe45 --- /dev/null +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BATCHNORM_OP + +#include "operators/kernel/batchnorm_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool BatchNormKernel::Init(BatchNormParam *param) { + this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl"); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + 
new_scale->InitCLImage(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + framework::CLImage *new_bias = new framework::CLImage(); + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + return true; +} + +template <> +void BatchNormKernel::Compute( + const BatchNormParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY()); + + auto input = param.InputX()->GetCLImage(); + auto out = param.OutputY()->GetCLImage(); + auto new_scale = param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + const int out_width = default_work_size[1]; + + clSetKernelArg(kernel, 1, sizeof(int), &out_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &out); + + // cl_event out_event = param.OutputY()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class BatchNormKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..9d0857a45e0766482e2dbb6ded77edb07517bc0f --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void batchnorm(__private const int out_width, + __read_only image2d_t input, + __read_only image2d_t new_scale_image, + __read_only image2d_t new_bias_image, + __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + half4 new_scale = read_imageh(new_scale_image, sampler, (int2)(out_c, 0)); + half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0)); + + int pos_x = mad24(out_c, out_width, out_w); + half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); + half4 out = mad(in, new_scale, new_bias); + + write_imageh(output, (int2)(pos_x, out_nh), out); +} diff --git a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1f2e36687ab04be2b8c18b26e868b7709bc3c231 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x/w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in + biase; + write_imageh(outputImage,coords,output); + } diff --git a/src/operators/kernel/cl/cl_kernel/cl_common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h new file mode 100644 index 0000000000000000000000000000000000000000..34f36eb9a3ffbdc5781c974926ea4a7d5258636b --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/cl_common.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +inline half4 activation(half4 in +#ifdef PRELU + , + half4 prelu_alpha +#endif +) { + half4 output; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (half4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (half4)(0.0f)); +#endif + return output; +} diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..aa3eaedda5634294f231831d550296dfdba0dd48 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8bf7e7d7d9fbb9eb9e930e9c1c3a58bb3391efc --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..8d686c20dfaa31204a4c44105fb479423352fb9e --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define RELU +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..2a5c823295c7562361433414cf35be81d2fbf00c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl new file mode 100644 index 0000000000000000000000000000000000000000..63e6e62345c8034ef914b4c385e6fd976b267c4c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -0,0 +1,701 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +/* +conv +conv_bn +conv_add +conv_relu +conv_bn_relu +conv_add_relu +conv_add_bn_relu +*/ + +#include "cl_common.h" + +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input[9]; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || 
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, 
pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + +/* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + float4 weight_x = read_imagef(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + float4 weight_y = read_imagef(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + float4 weight_z = read_imagef(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + float4 weight_w = read_imagef(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } +*/ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + 
pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 
+ 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, 
// Depthwise 3x3 convolution over half-precision image tiles.
// One work-item produces one half4 output pixel: out_c selects the channel
// block, (out_w, out_nh) the spatial position (out_nh folds batch * height).
// Optional bias / batch-norm / relu stages are compiled in via the
// BIASE / BATCH_NORM / RELU macros.
__kernel void depth_conv_3x3(__private const int global_size_dim0,
                             __private const int global_size_dim1,
                             __private const int global_size_dim2,
                             __read_only image2d_t input,
                             __read_only image2d_t filter,
#ifdef BIASE
                             __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                             __read_only image2d_t new_scale,
                             __read_only image2d_t new_biase,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int offset,
                             __private const int input_c,
                             __private const int dilation,
                             __private const int input_width,  /* of one block */
                             __private const int input_height, /* of one block */
                             __private const int output_width,
                             __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // Split the folded n*h coordinate back into batch index and row-in-batch.
  const int batch_index = out_nh / output_height;
  const int out_nh_in_one_batch = out_nh % output_height;

  const int2 stride_xy = (int2)(stride, stride);
  const int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
  const int2 in_pos_in_one_block =
      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  const int filter_width = 3;
  const int filter_height = 3;

  const int2 pos_in_input_block =
      (int2)(out_c * input_width, batch_index * input_height);
  const int2 pos_in_filter_block =
      (int2)(out_c * filter_width, batch_index * filter_height);
  const int filter_x = pos_in_filter_block.x;
  const int filter_y = pos_in_filter_block.y;

  // Gather the 3x3 neighbourhood; taps outside the current block are zeroed.
  // The (cond << 15) trick builds an all-lanes select mask by setting the
  // sign (MSB) bit of every ushort lane when the tap is out of bounds.
  half4 inputs[9];
  for (int dy = -1; dy <= 1; ++dy) {
    for (int dx = -1; dx <= 1; ++dx) {
      const int ix = in_pos_in_one_block.x + dx;
      const int iy = in_pos_in_one_block.y + dy;
      inputs[(dy + 1) * 3 + (dx + 1)] = select(
          read_imageh(input, sampler,
                      (int2)(pos_in_input_block.x + ix,
                             pos_in_input_block.y + iy)),
          (half4)(0.0f),
          (ushort4)((ix < 0 || iy < 0 || ix >= input_width ||
                     iy >= input_height)
                    << 15));
    }
  }

  // The nine filter taps for this channel block, row-major.
  half4 filters[9];
  for (int k = 0; k < 9; ++k) {
    filters[k] = read_imageh(filter, sampler,
                             (int2)(filter_x + k % 3, filter_y + k / 3));
  }

  for (int i = 0; i < 9; ++i) {
    output += inputs[i] * filters[i];
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, output_pos, output);
}

// 1x1 convolution: each output half4 is a weighted sum over all input
// channel blocks sampled at the (strided, offset) source position.
// Four filter rows per input block expand one input half4 into the four
// output lanes via mad() accumulation.
__kernel void conv_1x1(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const uint kernelHXW = 1;
  const int2 stride_xy = (int2)(stride, stride);
  const int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
  const int2 in_pos_in_one_block =
      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  for (int i = 0; i < input_c; ++i) {
    // Source pixel for input channel block i.
    const int2 pos_in =
        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
    const half4 input = read_imageh(input_image, sampler, pos_in);

    const half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
    const half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
    const half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
    const half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));

    output = mad(input.x, weight0, output);
    output = mad(input.y, weight1, output);
    output = mad(input.z, weight2, output);
    output = mad(input.w, weight3, output);
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
  write_imageh(output_image, output_pos, output);
}
half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0)); +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input.x, weight0_0, output0); + output0 = mad(input.y, weight0_1, output0); + output0 = mad(input.z, weight0_2, output0); + output0 = mad(input.w, weight0_3, output0); + + half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0)); + half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1)); + half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2)); + half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3)); + + output1 = mad(input.x, weight1_0, output1); + output1 = mad(input.y, weight1_1, output1); + output1 = mad(input.z, weight1_2, output1); + output1 = mad(input.w, weight1_3, output1); + + half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0)); + half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1)); + half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2)); + half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3)); + + output2 = mad(input.x, weight2_0, output2); + output2 = mad(input.y, weight2_1, output2); + output2 = mad(input.z, weight2_2, output2); + output2 = mad(input.w, weight2_3, output2); + + half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0)); + half4 weight3_1 = read_imageh(filter, 
sampler, (int2)(out_c + 3, i * 4 + 1)); + half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2)); + half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3)); + + output3 = mad(input.x, weight3_0, output3); + output3 = mad(input.y, weight3_1, output3); + output3 = mad(input.z, weight3_2, output3); + output3 = mad(input.w, weight3_3, output3); + + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0)); + +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); +#endif + + int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos0, output0); + + + int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos1, output1); + + + int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos2, output2); + + + int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos3, output3); +} + +*/ + + + + + + + + diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..3c3497f917d8a16c7c7e304edf00a4250066dce7 --- /dev/null +++ 
b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..2a5c823295c7562361433414cf35be81d2fbf00c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Element-wise addition of two half-precision images:
//   out(x, y) = input(x, y) + bias(x, y), one work-item per pixel.
// Fix: image kernel parameters must be declared with an image access
// qualifier (__read_only / __write_only), not the __global address space;
// "__global image2d_t" is not valid OpenCL C and fails strict compilers.
__kernel void elementwise_add(__read_only image2d_t input,
                              __read_only image2d_t bias,
                              __write_only image2d_t outputImage) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);
  const half4 in = read_imageh(input, sampler, coords);
  const half4 biase = read_imageh(bias, sampler, coords);
  write_imageh(outputImage, coords, in + biase);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Packs a planar (channel-major) float buffer into a half4 image:
// channel k of element (i, j) lands in lane k of pixel (j, i); lanes for
// missing channels are zero-filled.
// NOTE(review): only the first three channels are read and .w is always 0 —
// presumably inputs have c <= 3 (e.g. RGB); confirm behaviour for c >= 4.
__kernel void feed(__global float *in, __write_only image2d_t outputImage,
                   int h, int w, int c) {
  const int i = get_global_id(0);  // row
  const int j = get_global_id(1);  // column
  const int plane = h * w;         // elements per channel plane
  const int idx = i * w + j;       // offset within a plane

  half4 pixel;
  pixel.x = convert_half(in[idx]);
  pixel.y = (c >= 2) ? convert_half(in[plane + idx]) : (half)(0.0f);
  pixel.z = (c >= 3) ? convert_half(in[2 * plane + idx]) : (half)(0.0f);
  pixel.w = 0.0;

  write_imageh(outputImage, (int2)(j, i), pixel);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Unpacks a half4 image back into a planar float buffer.
// Each work-item writes the four channel lanes of one pixel; lane k lands
// size_ch elements apart in the flat output (channel-major layout).
__kernel void fetch(__private const int in_height,
                    __private const int in_width,
                    __read_only image2d_t input,
                    __global float *out,
                    __private const int size_ch,
                    __private const int size_block,
                    __private const int size_batch) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);
  const int in_n = in_nh / in_height;   // batch index
  const int in_h = in_nh % in_height;   // row within batch

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 pixel =
      read_imageh(input, sampler, (int2)(mad24(in_c, in_width, in_w), in_nh));

  const int base =
      in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
  out[base] = convert_float(pixel.x);
  out[base + size_ch] = convert_float(pixel.y);
  out[base + size_ch * 2] = convert_float(pixel.z);
  out[base + size_ch * 3] = convert_float(pixel.w);
}

// 2-D variant: pixels are copied out as four consecutive floats each
// (lane order x, y, z, w).
__kernel void fetch_2d(__private const int in_height,
                       __private const int in_width,
                       __read_only image2d_t input,
                       __global float *out) {
  const int in_w = get_global_id(1);
  const int in_h = get_global_id(2);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 pixel = read_imageh(input, sampler, (int2)(in_w, in_h));

  const int base = (in_h * in_width + in_w) * 4;
  out[base] = convert_float(pixel.x);
  out[base + 1] = convert_float(pixel.y);
  out[base + 2] = convert_float(pixel.z);
  out[base + 3] = convert_float(pixel.w);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX

// Max pooling over a (possibly padded) window.
// Fix: the window end must be derived from the *unclamped* start
// (out * stride - pad). The previous code clamped the start to 0 first and
// then added ksize, which shifted the window right/down by up to `pad`
// elements whenever padding clipped its beginning (e.g. out=0, pad=1,
// ksize=3 pooled rows 0..2 instead of the intended 0..1).
__kernel void pool_max(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;   // batch index
  const int out_h = out_nh % out_height;   // row within batch

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // Unclamped window origin, then intersect the window with the input.
  const int raw_start_h = out_h * stride_h - pad_top;
  const int raw_start_w = out_w * stride_w - pad_left;
  const int start_h = max(raw_start_h, 0);
  const int end_h = min(raw_start_h + ksize_h, in_height);
  const int start_w = max(raw_start_w, 0);
  const int end_w = min(raw_start_w + ksize_w, in_width);

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 max_value = (half4)(MIN_VALUE);
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      max_value = max(max_value,
                      read_imageh(input, sampler,
                                  (int2)(pos_in_x + x, pos_in_y + y)));
    }
  }

  write_imageh(output, (int2)(mad24(out_c, out_width, out_w), out_nh),
               max_value);
}

// Average pooling; padding is excluded from the divisor (num counts only
// in-bounds elements). Same window-origin fix as pool_max above.
__kernel void pool_avg(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;
  const int out_h = out_nh % out_height;

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int raw_start_h = out_h * stride_h - pad_top;
  const int raw_start_w = out_w * stride_w - pad_left;
  const int start_h = max(raw_start_h, 0);
  const int end_h = min(raw_start_h + ksize_h, in_height);
  const int start_w = max(raw_start_w, 0);
  const int end_w = min(raw_start_w + ksize_w, in_width);

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 sum = (half4)(0.0f);
  int num = 0;
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      num++;
    }
  }
  half4 avg = sum / num;
  write_imageh(output, (int2)(mad24(out_c, out_width, out_w), out_nh), avg);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// relu: out(x, y) = max(0, in(x, y)), one work-item per pixel.
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 v = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y),
               max((half4)(0.0f, 0.0f, 0.0f, 0.0f), v));
}

// relu_p0: identical clamp-at-zero variant, kept as a separate entry point
// for kernel-name dispatch.
__kernel void relu_p0(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 v = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y),
               max((half4)(0.0f, 0.0f, 0.0f, 0.0f), v));
}

// relu_p1: pure copy — no clamping is applied.
// NOTE(review): looks like an intentional pass-through variant; confirm
// callers expect identity behaviour here.
__kernel void relu_p1(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 v = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y), v);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// reshape: currently a straight pixel copy. The destination extents
// (d0..d3) and source extents (x0..x3) are accepted but unused.
// NOTE(review): a layout-permuting implementation exists only as
// commented-out code upstream — this copy is only correct when source and
// destination images share the same pixel layout; confirm callers.
__kernel void reshape(__read_only image2d_t input,
                      __write_only image2d_t output,
                      __private const int d0,
                      __private const int d1,
                      __private const int d2,
                      __private const int d3,
                      __private const int x0,
                      __private const int x1,
                      __private const int x2,
                      __private const int x3) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  write_imageh(output, (int2)(x, y),
               read_imageh(input, sampler, (int2)(x, y)));
}
int i0 = oindex; + int ix = (i1 / 4) * d3 + i3; + int iy = i0 * d2 + i2; + half4 p = read_imageh(input, sampler, (int2)(ix, iy)); + ((half*)&r)[i] = ((half*)&p)[i1%4]; + } + write_imageh(output, (int2)(x, y), r); +} + +*/ diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl new file mode 100644 index 0000000000000000000000000000000000000000..215ec69fc283dcb2b538300cb5591b2b9e4b6a13 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/softmax.cl @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void softmax(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int group + ) { + const int out_c = get_global_id(0); // block index + const int out_w = get_global_id(1); // index in one block + const int out_nh = get_global_id(2); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half maxv = 0.0f; + for (int i = 0; i < group; ++i) { + half4 temp = read_imageh(input_image, sampler, (int2)(i, 0)); + maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w)))); + } + + + half4 rsum = (half4)(0.0f); + for (int i = 0; i < group; ++i) { + half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); + rsum += convert_half4(exp(convert_float4(r - maxv))); + } + + float sum = rsum.x + rsum.y + rsum.z + rsum.w; + + half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh)); + half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum); + write_imageh(output_image, (int2)(out_w, out_nh), result); +} + +/* + +__kernel void softmax(__read_only image2d_t input, + __write_only image2d_t output, + __private const int d0, + __private const int d1, + __private const int d2, + __private const int d3) { + const int z = get_global_id(0); + const int x = get_global_id(1); + const int y = get_global_id(2); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 cv = read_imageh(input, sampler, (int2)(x, y)); + half4 maxv = cv; + for (int i = 0; i < d3; i++) { + half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y)); + maxv = max(maxv, temp); + } + half4 sum = (half4)0.0f; + // half4 x = = (half4)0.0f; + for (int i = 0; i < d3; i++) { + half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y)); + sum += exp(temp - maxv); + } + half4 r = exp(cv - maxv) / sum; + + write_imageh(output, (int2)(z * d3 + x, y), r); +} + +*/ diff --git 
a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33172e4f0343f1bb26e34f6c7d3b009629b60430 --- /dev/null +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,289 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "framework/cl/cl_image.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + param->Bias()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // const CL *mean = param->InputMean(); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + const int C = mean->numel(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " mean - " << j << mean->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " variance - " << j << variance->data()[j]; + // } + // 
+ // for (int j = 0; j < C; ++j) { + // DLOG << " scale - " << j << scale->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " bias - " << j << bias->data()[j]; + // } + + // + // DLOG << " climage mean: " << *mean; + // DLOG << " climage variance: " << *variance; + // DLOG << " climage scale: " << *scale; + // DLOG << " climage bias: " << *bias; + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " new scale - " << j << new_scale_ptr[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " new bias - " << j << new_bias_ptr[j]; + // } + + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - y bias: " << *(param->Bias()); + // + // DLOG << " climage - new scale: " << *new_scale; + + framework::CLImage *new_bias = new framework::CLImage(); + + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - new bias: " << *new_bias; + // + // DLOG << " climage - filter: " << *(param->Filter()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == 
param->Paddings()[1], + "need equal"); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + + param->SetOffset(offset); + + /* + if (param->Filter()->dims()[2] == 1 && + param->Filter()->dims()[3] == 1 && + (param->Filter()->dims()[0] % 16) == 0) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1 4"; + } + */ + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu depth_conv_3x3"; + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv_3x3"; + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto biase = param.Bias()->GetCLImage(); + auto new_scale = 
param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + // DLOG << " c block " << c_block; + // DLOG << " w " << w; + // DLOG << " nh " << nh; + // DLOG << " stride " << stride; + // DLOG << " offset " << offset; + // DLOG << " input_c " << input_c; + // DLOG << " dilation " << dilation; + // DLOG << " input width " << input_width; + // DLOG << " input height " << input_height; + // DLOG << " output width " << output_width; + // DLOG << " output height " << output_height; + // DLOG << " input dim " << param.Input()->dims(); + // DLOG << " output dim " << param.Output()->dims(); + // DLOG << " filter dim " << param.Filter()->dims(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + 
CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + /* + if (param.Filter()->dims()[2] == 1 && + param.Filter()->dims()[3] == 1 && + param.Filter()->dims()[0] % 16 == 0) { + DLOG << " before modifi work size: " << default_work_size; + + default_work_size[0] = default_work_size[0] / 4; + + DLOG << " modification work size: " << default_work_size; + DLOG << " input dims " << param.Input()->dims(); + DLOG << " output dims " << param.Output()->dims(); + DLOG << " filter dims: " << param.Filter()->dims(); + DLOG << " biase dims : " << param.Bias()->dims(); + + } + */ + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e30c6d31db645fb5d18bf70ef5b6876a5f683da --- /dev/null +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddKernel::Init(FusionConvAddParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void 
ConvAddKernel::Compute( + const FusionConvAddParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + DLOG << "---yangfei30---"; + DLOG << *param.Filter(); + DLOG << param.Paddings(); + auto biase = param.Bias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + 
CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..814cff634cb0c4c2d5dd6e6706b558bb1cd64f22 --- /dev/null +++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init( + FusionConvAddReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + DLOG << "---yangfei30---"; + DLOG << *param.Filter(); + DLOG << param.Paddings(); + auto biase = param.Bias()->GetCLImage(); + 
auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), 
kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05cefadce052fb65664cc797c800ec67e43f3a2c --- /dev/null +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvKernel::Init(ConvParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + auto filter_ddim = param->Filter()->dims(); + + std::vector filter_shape( + {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]}); + framework::DDim ddim = framework::make_ddim(filter_shape); + if (filter_ddim[1] == 1) { + param->Filter()->Resize(ddim); + } + + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + DLOG << " init helper: " << &cl_helper_; + DLOG << " conv kernel add kernel ~ "; + DLOG << " width of one block: " << param->Filter()->dims()[3]; + DLOG << " height of one block: " << param->Filter()->dims()[2]; + DLOG << " filter dims: " << param->Filter()->dims(); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + DLOG << " here1 "; + this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl"); + + } else if (param->Filter()->dims()[0] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + DLOG << " here2 "; + this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + DLOG << " here3 "; + this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = 
default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + DLOG << " begin set kernel arg "; + DLOG << " c block " << c_block; + DLOG << " w " << w; + DLOG << " nh " << nh; + DLOG << " stride " << stride; + DLOG << " offset " << offset; + DLOG << " input_c " << input_c; + DLOG << " dilation " << dilation; + DLOG << " input width " << input_width; + DLOG << " input height " << input_height; + DLOG << " output width " << output_width; + DLOG << " output height " << output_height; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = 
param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35813a31f570c8daf956e4c90d0f3e3de1675eb4 --- /dev/null +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef DEPTHWISECONV_OP + +#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DepthwiseConvKernel::Init(ConvParam *param) { + DLOG << " depthwise conv kernel init begin "; + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " depthwise conv kernel init end "; + return true; +} + +template <> +void DepthwiseConvKernel::Compute( + const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = 
clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); +} + +template class DepthwiseConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e62714b3fa3182706270627e7fd1a13b06f3b66a --- /dev/null +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init( + ElementwiseAddParam *param) { + DLOG << "-----init add-----"; + CLImage *bias = (CLImage *)(param->InputY()); + bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); + DLOG << " bias: " << *bias; + if (bias->dims().size() == 4) { + this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + } else if (param->InputY()->dims().size() == 1) { + this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); + } else { + DLOG << "error:bias dims is error"; + } + + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + auto input = param.InputX(); + auto bias = param.InputY(); + auto output = param.Out(); + cl_int status; + auto kernel = this->cl_helper_.KernelAt(0); + if (bias->dims().size() == 4) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + int width = input->ImageWidth(); + int height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias->dims().size() == 1) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[3]; + status = clSetKernelArg(kernel, 0, 
sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + int width = input->ImageWidth(); + int height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + cl_event out_event = param.Out()->GetClEvent(); + cl_event wait_event = param.InputX()->GetClEvent(); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + DLOG << "error:bias dims is error"; + } +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..78f04357a23c70595595cc24489fd96e994162fb --- /dev/null +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "operators/kernel/feed_kernel.h" +#include "framework/cl/cl_tensor.h" +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + DLOG << "Init feed"; + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + cl_int status; + auto output = param.Out(); + const Tensor *input = param.InputX(); + // DLOG << *input; + const float *input_data = input->data(); + int numel = input->numel(); + cl_mem cl_image = output->GetCLImage(); + int c = input->dims()[1]; + int height = output->dims()[2]; + int width = output->dims()[3]; + CLTensor input_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + input_cl_tensor.Resize(input->dims()); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input_data); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + CL_CHECK_ERRORS(status); + + size_t global_work_size[2] = {width, height}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31c1d4179cbdfc8145d90bee2353be821e65b40b --- /dev/null +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ 
-0,0 +1,101 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" +#include "framework/cl/cl_tensor.h" +// #include "common/common.h" +// #include + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + if (param->InputX()->dims().size() <= 2) { + this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); + } else { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + } + auto *out = param->Out(); + out->mutable_data(); + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); + + auto input = param.InputX()->GetCLImage(); + auto *out = param.Out(); + + const auto &dim = param.InputX()->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + + size_t C, in_height, in_width; + + C = new_dims[1]; + in_height = new_dims[2]; + if (dim.size() <= 2) { + in_width = param.InputX()->ImageWidth(); + } else { + in_width = new_dims[3]; + } + + CLTensor out_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + out_cl_tensor.Resize(out->dims()); + cl_mem outBuffer = out_cl_tensor.mutable_data(); + + clSetKernelArg(kernel, 0, sizeof(int), &in_height); + clSetKernelArg(kernel, 1, 
sizeof(int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); + if (dim.size() > 2) { + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + } + + // cl_event wait_event = param.InpdutX()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + // auto time1 = paddle_mobile::time(); + + // printf(" before finish \n"); + // clFlsh(this->cl_helper_.CLCommandQueue()); + clFinish(this->cl_helper_.CLCommandQueue()); + // printf(" after finish \n"); + + // auto time2 = paddle_mobile::time(); + // + // + // std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2) + // << "ms" << std::endl; + + memcpy(out->data(), out_cl_tensor.Data(), out->memory_size()); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/pool_kernel.cpp b/src/operators/kernel/cl/pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df79ababadd4c1b959a1eb0fe237a45ab97a6bd8 --- /dev/null +++ b/src/operators/kernel/cl/pool_kernel.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POOL_OP + +#include "operators/kernel/pool_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool PoolKernel::Init(PoolParam *param) { + std::string pooling_type = param->PoolingType(); + this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl"); + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + + auto input = param.Input()->GetCLImage(); + auto out = param.Output()->GetCLImage(); + + framework::CLImageConverterFolder *input_folder_converter = + reinterpret_cast( + param.Input()->Converter()); + framework::CLImageConverterFolder *output_folder_converter = + reinterpret_cast( + param.Output()->Converter()); + + const int in_height = input_folder_converter->HeightOfOneBlock(); + const int in_width = input_folder_converter->WidthOfOneBlock(); + const int out_height = output_folder_converter->HeightOfOneBlock(); + const int out_width = output_folder_converter->WidthOfOneBlock(); + + std::string pooling_type = param.PoolingType(); + std::vector ksize = param.Ksize(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + const int pad_top = paddings[0]; + const int pad_left = paddings[1]; + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int ksize_h = ksize[0]; + const int ksize_w = ksize[1]; + + clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height); + clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height); + clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width); + clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top); + clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left); + clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h); + clSetKernelArg(kernel, 
7, sizeof(cl_int), &stride_w); + clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h); + clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w); + clSetKernelArg(kernel, 10, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 11, sizeof(cl_mem), &out); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class PoolKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3acfe442201a9be59c6f0a0a536cf9aea68c4a2 --- /dev/null +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef RELU_OP + +#include "operators/kernel/relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReluKernel::Init(ReluParam* param) { + this->cl_helper_.AddKernel("relu", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p0", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p1", "relu.cl"); + // const auto dim = + // const_cast(param->InputX())->ImageDims(); + // param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), + // this->cl_helper_.CLCommandQueue(), + // dim); + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam& param) { + auto kernel = this->cl_helper_.KernelAt(0); + // auto kernel_p0 = this->cl_helper_.KernelAt(1); + // auto kernel_p1 = this->cl_helper_.KernelAt(2); + const auto* input = param.InputX(); + auto* output = param.Out(); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + // auto tImage = + // const_cast&>(param).getMidImage().GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); + const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); + // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3, + // NULL, + // work_size, NULL, 0, NULL, NULL); +} + +template class ReluKernel; + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git 
a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fb3aa9b52f722b21cdc30e54eafadf9dffcfef7a --- /dev/null +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/reshape_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReshapeKernel::Init(ReshapeParam *param) { + this->cl_helper_.AddKernel("reshape", "reshape.cl"); + return true; +} + +template <> +void ReshapeKernel::Compute(const ReshapeParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + const auto *input = param.InputX(); + auto *output = param.Out(); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + const auto &inputDim = input->dims(); + const auto &outputDim = output->dims(); + int dims[4] = {1, 1, 1, 1}; + int odims[4] = {1, 1, 1, 1}; + // 1 1000 1 1 + for (int i = 0; i < inputDim.size(); i++) { + dims[4 - inputDim.size() + i] = inputDim[i]; + } + + // 1 1 1 1000 + for (int i = 0; i < outputDim.size(); i++) { + odims[4 - outputDim.size() + i] = outputDim[i]; + } + clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); + clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); + 
clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); + clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); + clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); + clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); + const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); +} + +template class ReshapeKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22e6672ee462b963476dc72895329a9117fc16a8 --- /dev/null +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SOFTMAX_OP + +#include "operators/kernel/softmax_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SoftmaxKernel::Init(SoftmaxParam *param) { + this->cl_helper_.AddKernel("softmax", "softmax.cl"); + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); + const auto *input = param.InputX(); + auto *output = param.Out(); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + + int group = output->ImageWidth(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + status = clSetKernelArg(kernel, 2, sizeof(int), &group); + + // const auto &inputDim = input->dims(); + // + // int dims[4] = {1, 1, 1, 1}; + // + // for (int i = 0; i < inputDim.size(); i++) { + // dims[4 - inputDim.size() + i] = inputDim[i]; + // } + // + // clSetKernelArg(kernel, 2, sizeof(int), &dims); + // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); + // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); + // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); +} + +template class SoftmaxKernel; + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h index 61100bf5f0e9de43bfb6295a0719f1be0954d128..ac9ebca4d5ab30307303b8720677e67470634b44 100644 --- a/src/operators/kernel/concat_kernel.h +++ b/src/operators/kernel/concat_kernel.h @@ -27,7 +27,7 @@ template class ConcatKernel : 
public framework::OpKernelBase> { public: - void Compute(const ConcatParam ¶m) const; + void Compute(const ConcatParam ¶m); bool Init(ConcatParam *param); }; diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h index 5715cd46d5a6c7e80ab5ff77ba83c7973e1db811..fadaf7564ceeb7a52215dc335135016be02bc1ab 100644 --- a/src/operators/kernel/conv_add_add_prelu_kernel.h +++ b/src/operators/kernel/conv_add_add_prelu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddAddPReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddAddPReluParam ¶m) const; + void Compute(const FusionConvAddAddPReluParam ¶m); bool Init(FusionConvAddAddPReluParam *param); }; diff --git a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h index ee73215c4688c3e604de69cda55b05e63844c0b8..7a921ecc7d0f4498cae80fbb9cea1b13e4c94101 100644 --- a/src/operators/kernel/conv_add_bn_kernel.h +++ b/src/operators/kernel/conv_add_bn_kernel.h @@ -35,7 +35,7 @@ template class ConvAddBNKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddBNParam ¶m) const; + void Compute(const FusionConvAddBNParam ¶m); bool Init(FusionConvAddBNParam *param); }; diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 9faaaedcf8d6f825f818ebf5121dc7685185d5d8..3f088528fc901987873038c7e1dd779dcc2019e7 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddBNReluParam ¶m) const; + void Compute(const FusionConvAddBNReluParam ¶m); bool Init(FusionConvAddBNReluParam *param); }; diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 360cbb6775168885e9c1a25db1f9ffb9e552324b..4e9ff0853f1d502ebb4dc4ef3641d0a879f32b60 100644 --- 
a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -40,7 +40,7 @@ template class ConvAddKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddParam ¶m) const; + void Compute(const FusionConvAddParam ¶m); bool Init(FusionConvAddParam *param); }; diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h index a109f84cf09b4d0e2469a1885b902c0f70acc6c8..631982789b09c57d0d21186d0a30df7368d2955f 100644 --- a/src/operators/kernel/conv_add_prelu_kernel.h +++ b/src/operators/kernel/conv_add_prelu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddPReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddPReluParam ¶m) const; + void Compute(const FusionConvAddPReluParam ¶m); bool Init(FusionConvAddPReluParam *param); }; diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index f33b1dc312e1d94be0c23cff55e9e6789a556bc7..e001926b361da96ec3ff76e120bc3d1ad13714fa 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddReluParam ¶m) const; + void Compute(const FusionConvAddReluParam ¶m); bool Init(FusionConvAddReluParam *param); }; diff --git a/src/operators/kernel/conv_bn_add_relu_kernel.h b/src/operators/kernel/conv_bn_add_relu_kernel.h index 820e5f8bcbf58676e8374e575044b10fe4676efa..dcd8fecf07fbb4ea75b382f5315e24e64e26e939 100644 --- a/src/operators/kernel/conv_bn_add_relu_kernel.h +++ b/src/operators/kernel/conv_bn_add_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvBNAddReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNAddReluParam ¶m) const; + void Compute(const FusionConvBNAddReluParam ¶m); bool Init(FusionConvBNAddReluParam *param); }; diff --git a/src/operators/kernel/conv_bn_kernel.h 
b/src/operators/kernel/conv_bn_kernel.h index f740ca836481c1331ea2e889865b3078d48644a6..e669f3bdd85dbd89e3a48d417dcd0cd6b9706062 100644 --- a/src/operators/kernel/conv_bn_kernel.h +++ b/src/operators/kernel/conv_bn_kernel.h @@ -35,7 +35,7 @@ template class ConvBNKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNParam ¶m) const; + void Compute(const FusionConvBNParam ¶m); bool Init(FusionConvBNParam *param); }; diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h index 225976aa5db31096ef691ecefa8b63d4ae3dc277..91b3413116ae22a8e212cf149c4e0c2a8924664a 100644 --- a/src/operators/kernel/conv_bn_relu_kernel.h +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNReluParam ¶m) const; + void Compute(const FusionConvBNReluParam ¶m); bool Init(FusionConvBNReluParam *param); }; diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index 93474adaa97743d1850b53df114ae08f144aebca..cac498c36bd5debef0ff996cdf017355a2371a18 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -31,7 +31,7 @@ using framework::OpKernelBase; template class ConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; + void Compute(const ConvParam ¶m); bool Init(ConvParam *param); }; diff --git a/src/operators/kernel/conv_transpose_kernel.h b/src/operators/kernel/conv_transpose_kernel.h index 761370095cae9751eb479521d6378c4f7ccaefe5..6341a87d43fdb3a3ca63fadd90239bdf2a6921a8 100644 --- a/src/operators/kernel/conv_transpose_kernel.h +++ b/src/operators/kernel/conv_transpose_kernel.h @@ -28,7 +28,7 @@ template class ConvTransposeKernel : public OpKernelBase> { public: - void Compute(const ConvTransposeParam ¶m) const; + void Compute(const ConvTransposeParam ¶m); bool Init(ConvTransposeParam *param); }; diff --git 
a/src/operators/kernel/crf_kernel.h b/src/operators/kernel/crf_kernel.h index 71c07cf0384d482522de3a6652c6d24a22af656a..1436aafc0603d4c7ba9ecae911f10bd8f297852a 100644 --- a/src/operators/kernel/crf_kernel.h +++ b/src/operators/kernel/crf_kernel.h @@ -28,7 +28,7 @@ template class CrfKernel : public framework::OpKernelBase> { public: - void Compute(const CrfParam& param) const; + void Compute(const CrfParam& param); bool Init(CrfParam* param); }; } // namespace operators diff --git a/src/operators/kernel/deconv_relu_kernel.h b/src/operators/kernel/deconv_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bc85f1ffee19abe3941bd9d90fb8dfd04280ce14 --- /dev/null +++ b/src/operators/kernel/deconv_relu_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DeconvReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionDeconvReluParam ¶m); + + bool Init(FusionDeconvReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h index 605b81cd6ed4ccd54b1803cf7a603b8f4576982d..3ee5bf86e97baa3970239e32b7fd5fc341e09f92 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -31,7 +31,7 @@ template class DepthwiseConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; + void Compute(const ConvParam ¶m); bool Init(ConvParam *param); }; } // namespace operators diff --git a/src/operators/kernel/dequantize_kernel.h b/src/operators/kernel/dequantize_kernel.h index d147e3f94ab87165cceac886289e74747906e047..6ba8ec88c52f20ccfcd30d5b9a217eaef658d507 100644 --- a/src/operators/kernel/dequantize_kernel.h +++ b/src/operators/kernel/dequantize_kernel.h @@ -26,7 +26,7 @@ template class DequantizeKernel : public framework::OpKernelBase> { public: - void Compute(const DequantizeParam ¶m) const; + void Compute(const DequantizeParam ¶m); bool Init(DequantizeParam *param); }; diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index b7535095d4fef11ee628aea96a074abcc3562f7f..2f59d01b6723eea274b1ed059ae08863a4937961 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -26,7 +26,7 @@ template class DropoutKernel : public framework::OpKernelBase> { public: - void Compute(const DropoutParam& param) const; + void Compute(const DropoutParam& param); bool Init(DropoutParam* para); }; } // namespace operators diff 
--git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h index 594c594cb00f8f4ddd8a511f3c992c4efbfcdfc6..f2e4c0afbd0aaafff5339816764f9e30592f122c 100644 --- a/src/operators/kernel/dwconv_bn_relu_kernel.h +++ b/src/operators/kernel/dwconv_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class DWConvBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionDWConvBNReluParam ¶m) const; + void Compute(const FusionDWConvBNReluParam ¶m); bool Init(FusionDWConvBNReluParam *param); }; diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h index 67182af2e20e23c40effab6b87eefde1e0ab629d..8fa07e519ec0b78baffabd08fb7e524f8259c9eb 100644 --- a/src/operators/kernel/elementwise_add_kernel.h +++ b/src/operators/kernel/elementwise_add_kernel.h @@ -30,7 +30,7 @@ class ElementwiseAddKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddParam ¶m) const; + void Compute(const ElementwiseAddParam ¶m); bool Init(ElementwiseAddParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_add_relu_kernel.h b/src/operators/kernel/elementwise_add_relu_kernel.h index 5eda5a0c56c228ad54c888b6faa82ce9417f2dc1..d18c4e27fa3345b1818d0e6149fc8fb83195f644 100644 --- a/src/operators/kernel/elementwise_add_relu_kernel.h +++ b/src/operators/kernel/elementwise_add_relu_kernel.h @@ -29,7 +29,7 @@ class ElementwiseAddReluKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddReluParam ¶m) const; + void Compute(const ElementwiseAddReluParam ¶m); bool Init(ElementwiseAddReluParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h index 63f0df4815dc143e482140a855eb254bd016d50c..54baa50fcafb8ddbbefecb635ea85f120f16250d 100644 --- a/src/operators/kernel/elementwise_mul_kernel.h +++ b/src/operators/kernel/elementwise_mul_kernel.h 
@@ -28,7 +28,7 @@ class ElementwiseMulKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseMulParam ¶m) const; + void Compute(const ElementwiseMulParam ¶m); bool Init(ElementwiseMulParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_sub_kernel.h b/src/operators/kernel/elementwise_sub_kernel.h index 9516dcbd3de09debe233571eb5f60b3b8b19a2fa..89536b920837b57c4017ccadff7ea6e233cd999e 100644 --- a/src/operators/kernel/elementwise_sub_kernel.h +++ b/src/operators/kernel/elementwise_sub_kernel.h @@ -28,7 +28,7 @@ class ElementwiseSubKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseSubParam ¶m) const; + void Compute(const ElementwiseSubParam ¶m); bool Init(ElementwiseSubParam *param); }; diff --git a/src/operators/kernel/fc_relu_kernel.h b/src/operators/kernel/fc_relu_kernel.h index 6e9446da37df4ba83db85d416aa87f216816c4a5..6735a50bee86e25d9f8d091b6218a472f3838aec 100644 --- a/src/operators/kernel/fc_relu_kernel.h +++ b/src/operators/kernel/fc_relu_kernel.h @@ -28,7 +28,7 @@ class FusionFcReluKernel : public framework::OpKernelBase> { public: - void Compute(const FusionFcReluParam& param) const; + void Compute(const FusionFcReluParam& param); bool Init(FusionFcReluParam* param); }; } // namespace operators diff --git a/src/operators/kernel/feed_kernel.h b/src/operators/kernel/feed_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2b1220fee534040e5ccae5aee84adf3b4b6290b9 --- /dev/null +++ b/src/operators/kernel/feed_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using namespace framework; +template +class FeedKernel + : public framework::OpKernelBase> { + public: + void Compute(const FeedParam ¶m); + bool Init(FeedParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fetch_kernel.h b/src/operators/kernel/fetch_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d9ed91855d0db5149cc8cf4f5d571afd1fbea98f --- /dev/null +++ b/src/operators/kernel/fetch_kernel.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +template +class FetchKernel + : public framework::OpKernelBase> { + public: + void Compute(const FetchParam ¶m); + bool Init(FetchParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/flatten_kernel.h b/src/operators/kernel/flatten_kernel.h index 80d66ccf87c21532c8b4590d992f5bccbe4f00dc..4846725bcb6522389d29e137980b9d53e63f9f32 100644 --- a/src/operators/kernel/flatten_kernel.h +++ b/src/operators/kernel/flatten_kernel.h @@ -28,7 +28,7 @@ template class FlattenKernel : public framework::OpKernelBase> { public: - void Compute(const FlattenParam& param) const; + void Compute(const FlattenParam& param); bool Init(FlattenParam* param); }; } // namespace operators diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/V1/concat_kernel.cpp similarity index 99% rename from src/operators/kernel/fpga/concat_kernel.cpp rename to src/operators/kernel/fpga/V1/concat_kernel.cpp index f61afd4a5c514ced87396313ea5d645fe830e12a..6644bfd83e57a7fd147c0cc6383e64eb2ad79e51 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V1/concat_kernel.cpp @@ -58,7 +58,7 @@ bool ConcatKernel::Init(ConcatParam *param) { } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) { ComputeFPGAConcat(param.FpgaArgs()); } template class ConcatKernel; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/conv_add_bn_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp index 9b3944fc9a9ab308d9fe8b791a34e09651b87e6e..679a95ff54168da821ed0debb80b6bce8eca407b 100644 --- 
a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp @@ -78,7 +78,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { template <> void ConvAddBNKernel::Compute( - const FusionConvAddBNParam ¶m) const { + const FusionConvAddBNParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp index 83f74e97d04eda29f3aaa6a0cc16ed7d194321d8..6c99750eb824940b32a857ee2baffc72bce05a7a 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp @@ -76,7 +76,7 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp similarity index 97% rename from src/operators/kernel/fpga/conv_add_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp index 4975f2a905dcd76c5b7f013eafaa376dd2bb1646..ce2fbbda0ee4c7e0a1e97b45674ef269df3be3be 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp @@ -58,7 +58,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp similarity index 96% rename from src/operators/kernel/fpga/conv_bn_kernel.cpp rename to 
src/operators/kernel/fpga/V1/conv_bn_kernel.cpp index 276e71b6a44e9a7beba0d5db2f51472a9927d8da..ac9f19e411a87bb31e320df504a0e1c88e195454 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp @@ -69,8 +69,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { } template <> -void ConvBNKernel::Compute( - const FusionConvBNParam ¶m) const { +void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/conv_bn_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index f519a37cb57378a603969adae255f88ae8a5df2a..4c9eb391ada9366478877494fbe466d5cf919327 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -70,7 +70,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { template <> void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) const { + const FusionConvBNReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/V1/dropout_kernel.cpp similarity index 91% rename from src/operators/kernel/fpga/dropout_kernel.cpp rename to src/operators/kernel/fpga/V1/dropout_kernel.cpp index b0981c4254060996a16f4ae5beabb7c22edd6d34..8b990d46e0b90bf67eaf36bbf38238fd4432ace6 100644 --- a/src/operators/kernel/fpga/dropout_kernel.cpp +++ b/src/operators/kernel/fpga/V1/dropout_kernel.cpp @@ -26,8 +26,7 @@ bool DropoutKernel::Init(DropoutParam *param) { } template <> -void DropoutKernel::Compute( - const DropoutParam ¶m) const {} +void DropoutKernel::Compute(const DropoutParam ¶m) {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp 
b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp similarity index 97% rename from src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp index b592dd6d59a5d5cec8f12ef304099d2b89a10a05..5253d4d0d3e00190b4ed594279d9190659ec6026 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp @@ -56,7 +56,7 @@ bool ElementwiseAddReluKernel::Init( template <> void ElementwiseAddReluKernel::Compute( - const ElementwiseAddReluParam ¶m) const { + const ElementwiseAddReluParam ¶m) { fpga::ComputeFpgaEWAdd(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/fc_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/fc_relu_kernel.cpp index 52d7c0a4e69080e11f86d1507829e7e779a69228..2c6b616689dca14474d1cbdc3769b438de1358e4 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp @@ -61,7 +61,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { } template <> void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) const { + const FusionFcReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..161d8c9f0cf22ac79d1367e07b8ba3318a7a7123 --- /dev/null +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto input = + reinterpret_cast(const_cast(param.InputX())); + auto input_ptr = input->data(); + fpga::format_image(input); + Tensor *output = param.Out(); + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = reinterpret_cast(input_ptr); + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); +} +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e6e4591168b90cbe19b207cd9e77eaf5cd07de80 --- /dev/null +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp similarity index 96% rename from src/operators/kernel/fpga/fusion_fc_kernel.cpp rename to src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp index 407e14238d542604e876ced624d5a0db698a6101..9258fb90e1e6bf9a597a387843ce781858628139 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp @@ -62,8 +62,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { } template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { +void FusionFcKernel::Compute(const FusionFcParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp similarity index 96% rename from src/operators/kernel/fpga/pool_kernel.cpp rename to src/operators/kernel/fpga/V1/pool_kernel.cpp index 6269506836c25d756040cd25cf9b0189fd03d89b..8eefc3e9bea0b3662b4c08409f16f86dab60968a 100644 --- 
a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -34,7 +34,7 @@ bool PoolKernel::Init(PoolParam *param) { fpga::PoolingArgs poolArgs = {0}; poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1 poolArgs.kernel_reciprocal = - fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); + fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); // NOLINT poolArgs.image.address = input_ptr; poolArgs.image.channels = (uint32_t)input->dims()[1]; poolArgs.image.height = (uint32_t)input->dims()[2]; @@ -53,7 +53,7 @@ bool PoolKernel::Init(PoolParam *param) { } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { +void PoolKernel::Compute(const PoolParam ¶m) { fpga::ComputeFpgaPool(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp similarity index 88% rename from src/operators/kernel/fpga/softmax_kernel.cpp rename to src/operators/kernel/fpga/V1/softmax_kernel.cpp index e36db57f4b4f18712df50b2b132cdd1032a41921..37c03e2404f761f3089adb852b94bef27bec1ce9 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -14,11 +14,9 @@ limitations under the License. 
*/ #ifdef SOFTMAX_OP -#include "../softmax_kernel.h" -#include "../central-arm-func/softmax_arm_func.h" -#include "common/types.h" -#include "fpga/api.h" -#include "operators/math/softmax.h" +#include "operators/kernel/softmax_kernel.h" +#include "operators/kernel/central-arm-func/softmax_arm_func.h" + namespace paddle_mobile { namespace operators { @@ -47,8 +45,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { } template <> -void SoftmaxKernel::Compute( - const SoftmaxParam ¶m) const { +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { Tensor *in_x = param.FloatInput(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/fpga/V2/concat_kernel.cpp b/src/operators/kernel/fpga/V2/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f9ab66d48489dbecae01f819bd607c582f6145b --- /dev/null +++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConcatKernel::Init(ConcatParam *param) { + auto inputs = param->Inputs(); + auto out = param->Out(); + auto image_num = inputs.size(); + auto images_in = + (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT + auto scales_in = + (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT + auto channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT + auto aligned_channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT + + auto height = inputs[0]->dims()[2]; + auto width = inputs[0]->dims()[3]; + auto out_channel = + (uint32_t)fpga::get_aligned_channel_num((int)out->dims()[1]); // NOLINT + for (int i = 0; i < image_num; i++) { + auto input = inputs[i]; + PADDLE_MOBILE_ENFORCE( + input->dims()[2] == height && input->dims()[3] == width, + "Image height & width should be unified"); + images_in[i] = (half *)input->data(); // NOLINT + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + aligned_channel_num[i] = + (uint32_t)fpga::get_aligned_channel_num(channel_num[i]); + scales_in[i] = input->scale; + } + fpga::format_concat_output(out, (int)height, (int)width, // NOLINT + out_channel); + + fpga::ConcatArgs concatArgs = {0}; + concatArgs.image_num = (uint32_t)image_num; + concatArgs.images_in = images_in; + concatArgs.scales_in = scales_in; + concatArgs.image_out = (half *)out->data(); // NOLINT + concatArgs.scale_out = out->scale; + concatArgs.channel_num = channel_num; + concatArgs.aligned_channel_num = aligned_channel_num; + concatArgs.out_channel = out_channel; + concatArgs.height = (uint32_t)height; + concatArgs.width = (uint32_t)width; + param->SetFpgaArgs(concatArgs); + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam ¶m) { + fpga::ComputeFPGAConcat(param.FpgaArgs()); +} +template class 
ConcatKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7c03daf7797dbc09ba85a4f4e32e983571d192df --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBN_OP + +#include "operators/kernel/conv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { + bool relu_enabled = false; + auto input = const_cast(param->Input()); + + auto bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + + auto out = param->Output(); + + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + + const int channel = out->dims()[1]; + auto bs_ptr = + reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); + auto new_scale = new Tensor(); + auto 
new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + + return true; +} + +template <> +void ConvAddBNKernel::Compute( + const FusionConvAddBNParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8737554e6f8c343491656ca7659e1850d84ea246 --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + + const int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + 2] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} 
// namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3c4443645e421ee0dce10f53914600fb7af75bf --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = bias_ptr[i]; + } + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], 
param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..070fce98b9e5f0c7055943447602dba8ae78c7c4 --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBN_OP + +#include "operators/kernel/conv_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNKernel::Init(FusionConvBNParam *param) { + bool relu_enabled = false; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + const int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // // NOLINT + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp 
b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..95ac74cbf87fe20ef419e748f8a8a04df20c98e3 --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNRELU_OP + +#include "operators/kernel/conv_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + const int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] 
+ epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvBNReluKernel::Compute( + const FusionConvBNReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3284ddcdece3ab7fcf4fb4458a59d39c452ad1ce --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE_OP + +#include "operators/kernel/conv_transpose_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvTransposeKernel::Init(ConvTransposeParam *param) { + return true; +} + +template <> +void ConvTransposeKernel::Compute( + const ConvTransposeParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bf3556609a4ec2476521a9b8e80192f71aef4f52 --- /dev/null +++ b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVRELU_OP + +#include "operators/kernel/deconv_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvReluKernel::Init(FusionDeconvReluParam *param) { + return true; +} + +template <> +void DeconvReluKernel::Compute( + const FusionDeconvReluParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/dropout_kernel.cpp b/src/operators/kernel/fpga/V2/dropout_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8b990d46e0b90bf67eaf36bbf38238fd4432ace6 --- /dev/null +++ b/src/operators/kernel/fpga/V2/dropout_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef DROPOUT_OP + +#include "operators/kernel/dropout_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DropoutKernel::Init(DropoutParam *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void DropoutKernel::Compute(const DropoutParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b5085f26123994effa319826d84f2f249c80847 --- /dev/null +++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); + fpga::format_fp16_ofm(out, aligned_channel_num); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..571987b3bf2a88c0d4ad648c7cb1966b538983a5 --- /dev/null +++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#include "operators/kernel/elementwise_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddReluKernel::Init( + ElementwiseAddReluParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); + fpga::format_fp16_ofm(out, aligned_channel_num); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = 
(uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddReluKernel::Compute( + const ElementwiseAddReluParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba869aaca7f3f5d5c598feb3837a59a3a738493b --- /dev/null +++ b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_FCRELU_OP +#include "operators/kernel/fc_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcReluKernel::Init(FusionFcReluParam *param) { + bool relu_enabled = true; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + auto input_z = param->InputZ(); + auto input_z_ptr = input_z->data(); + auto out = param->Out(); + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; + } + + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + out->Resize(framework::make_ddim({1, channel, 1, 1})); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + fpga::format_fc_data(filter, out, bs_ptr); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} +template <> +void FusionFcReluKernel::Compute( + const FusionFcReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4092307083bd38346b03857b8e9ec858795f3941 --- /dev/null +++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + Tensor *output = param->Out(); + int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]); + fpga::format_fp16_ofm(output, aligned_channel); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto input = + reinterpret_cast(const_cast(param.InputX())); + auto input_ptr = input->data(); + fpga::format_image(input); + Tensor *output = param.Out(); + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = reinterpret_cast(input_ptr); + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); +} +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/src/operators/kernel/fpga/V2/fetch_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..e6e4591168b90cbe19b207cd9e77eaf5cd07de80 --- /dev/null +++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..589c21d667f39e24e8f62abafd38ab30523dd2de --- /dev/null +++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_FC_OP + +#include "operators/kernel/fusion_fc_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcKernel::Init(FusionFcParam *param) { + bool relu_enabled = false; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + const Tensor *input_z = param->InputZ(); + auto input_z_ptr = input_z->data(); + auto out = param->Out(); + + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; + } + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + out->Resize(framework::make_ddim({1, channel, 1, 1})); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + fpga::format_fc_data(filter, out, bs_ptr); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void FusionFcKernel::Compute(const FusionFcParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/pool_kernel.cpp b/src/operators/kernel/fpga/V2/pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..480aca4eb318c18618db4f7bb498d21c10f857c8 --- /dev/null +++ 
b/src/operators/kernel/fpga/V2/pool_kernel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef POOL_OP + +#include "operators/kernel/pool_kernel.h" + +class PoolingArgs; +namespace paddle_mobile { +namespace operators { + +template <> +bool PoolKernel::Init(PoolParam *param) { + auto *input = const_cast(param->Input()); + auto input_ptr = input->data(); + Tensor *output = param->Output(); + int aligned_channel_num = + fpga::get_aligned_channel_num((int)output->dims()[1]); // NOLINT + fpga::format_fp16_ofm(output, aligned_channel_num); + auto output_ptr = output->mutable_data(); + vector ksize = param->Ksize(); + vector strides = param->Strides(); + vector paddings = param->Paddings(); + std::string pooling_type = param->PoolingType(); + + fpga::PoolingArgs poolArgs = {0}; + poolArgs.mode = pooling_type == "max" ? 
0 : 1; // max:0, avg:1 + poolArgs.kernel_reciprocal = + fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); // NOLINT + poolArgs.image.address = input_ptr; + poolArgs.image.channels = (uint32_t)input->dims()[1]; + poolArgs.image.height = (uint32_t)input->dims()[2]; + poolArgs.image.width = (uint32_t)input->dims()[3]; + poolArgs.image.pad_height = (uint32_t)paddings[0]; + poolArgs.image.pad_width = (uint32_t)paddings[1]; + poolArgs.image.scale_address = input->scale; + poolArgs.output.address = output_ptr; + poolArgs.output.scale_address = output->scale; + poolArgs.kernel.height = (uint32_t)ksize[0]; + poolArgs.kernel.width = (uint32_t)ksize[1]; + poolArgs.kernel.stride_h = (uint32_t)strides[0]; + poolArgs.kernel.stride_w = (uint32_t)strides[1]; + param->SetFpgaArgs(poolArgs); + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam ¶m) { + fpga::ComputeFpgaPool(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bc3fbfd796fac693a319ed2ab24023b3ffb84863 --- /dev/null +++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators { +template <> +bool SliceKernel::Init(SliceParam* param) { + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) {} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bbdb35b715b60b25079c007a74b8b1e901cc9a59 --- /dev/null +++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SOFTMAX_OP + +#include "operators/kernel/softmax_kernel.h" +#include "operators/kernel/central-arm-func/softmax_arm_func.h" +namespace paddle_mobile { +namespace operators { + +template <> +bool SoftmaxKernel::Init(SoftmaxParam *param) { + auto input = const_cast(param->InputX()); + auto input_ptr = input->data(); + auto float_input = new Tensor; + float_input->mutable_data({1, input->dims()[1]}); + fpga::format_fp32_ofm(float_input, 8); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = 1; + args.image.width = 1; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { + Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate( + (void *)in_x->data(), // NOLINT + fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT + sizeof(float)); + math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/src/operators/kernel/fpga/V2/tanh_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46dd3a0f6f8819f6485243a445725554943ab2bf --- /dev/null +++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TANH_OP + +#include "operators/kernel/tanh_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool TanhKernel::Init(TanhParam *param) { + return true; +} + +template <> +void TanhKernel::Compute(const TanhParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h index 06d3981bd23708aee982e38d82ba592d69733a89..b8086bc66fbef7ec952548a3cb863cfa031c504e 100644 --- a/src/operators/kernel/fusion_fc_kernel.h +++ b/src/operators/kernel/fusion_fc_kernel.h @@ -27,7 +27,7 @@ template class FusionFcKernel : public framework::OpKernelBase> { public: - void Compute(const FusionFcParam& param) const; + void Compute(const FusionFcParam& param); bool Init(FusionFcParam* param); }; diff --git a/src/operators/kernel/gru_kernel.h b/src/operators/kernel/gru_kernel.h index 6b02663bd0e2982bdb2480c54632d2a8da9f67fc..b03b2e3ecb514fdf962bde9c06620fa6e64934df 100644 --- a/src/operators/kernel/gru_kernel.h +++ b/src/operators/kernel/gru_kernel.h @@ -28,7 +28,7 @@ template class GruKernel : public framework::OpKernelBase> { public: - void Compute(const GruParam& param) const; + void Compute(const GruParam& param); bool Init(GruParam* param); }; } // namespace operators diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h index df93ea5abacda1a5291caa53dc5dae7ea2b5d710..b15eb68996a990f6bc770db6940be83a0eea0cbf 100644 --- a/src/operators/kernel/im2sequence_kernel.h +++ 
b/src/operators/kernel/im2sequence_kernel.h @@ -29,7 +29,7 @@ template class Im2SequenceKernel : public framework::OpKernelBase> { public: - void Compute(const Im2SequenceParam& param) const; + void Compute(const Im2SequenceParam& param); bool Init(Im2SequenceParam* para); }; } // namespace operators diff --git a/src/operators/kernel/lookup_kernel.h b/src/operators/kernel/lookup_kernel.h index 73f6cfcced078382b40526eae1f6560d7d168b97..8c29349e737b0fba95688e1ebb8fe893a29b2a4f 100644 --- a/src/operators/kernel/lookup_kernel.h +++ b/src/operators/kernel/lookup_kernel.h @@ -28,7 +28,7 @@ template class LookupKernel : public framework::OpKernelBase> { public: - void Compute(const LookupParam& param) const; + void Compute(const LookupParam& param); bool Init(LookupParam* param); }; } // namespace operators diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h index 164178f1dcc0ee2523fc9c5fdc4736c14a3e55ce..99dbfe2d658cde17e6399f8ea4bc5b945092cde5 100644 --- a/src/operators/kernel/lrn_kernel.h +++ b/src/operators/kernel/lrn_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once + #ifdef LRN_OP #ifdef _OPENMP #include @@ -173,7 +175,7 @@ template class LrnKernel : public framework::OpKernelBase> { public: - void Compute(const LrnParam ¶m) const; + void Compute(const LrnParam ¶m); bool Init(LrnParam *param); }; } // namespace operators diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp index 50f6ef5f566347c089869c30b8f7534a4f8b6779..5d50ca9a7250f66f20b6bfaf0d93db18014d791c 100755 --- a/src/operators/kernel/mali/batchnorm_kernel.cpp +++ b/src/operators/kernel/mali/batchnorm_kernel.cpp @@ -145,7 +145,7 @@ bool BatchNormKernel::Init(BatchNormParam* param) { template <> void BatchNormKernel::Compute( - const BatchNormParam& param) const { + const BatchNormParam& param) { std::cout << "init acl" << std::endl; AclBatchNormOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp index 267c0101a8f66de3d508dbe5795c87ee5027a288..2fb05ab10eccf4e0dca9c74bbcc83067b438e981 100644 --- a/src/operators/kernel/mali/concat_kernel.cpp +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -118,7 +118,7 @@ bool ConcatKernel::Init(ConcatParam* param) { template <> void ConcatKernel::Compute( - const ConcatParam& param) const { + const ConcatParam& param) { std::cout << "init acl" << std::endl; AclConcatOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp index 74cace00dd2dead7a5d9ddfc76e2d48c67cccf89..427bcd596f71bf434ea155d04f192c5bdedfded5 100644 --- a/src/operators/kernel/mali/conv_add_kernel.cpp +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -212,7 +212,7 @@ bool ConvAddKernel::Init(FusionConvAddParam* param) { template <> void ConvAddKernel::Compute( - const FusionConvAddParam& param) const { + const FusionConvAddParam& param) { std::cout << "init acl" << std::endl; AclConvAddOp* 
acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp index 7852e64990e5a2cd6f3d7e803e71c23c55aa7a27..7cca16274ecc7ae1707f8d5ed8faf2fde810ab30 100644 --- a/src/operators/kernel/mali/conv_kernel.cpp +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -211,8 +211,7 @@ bool ConvKernel::Init(ConvParam* param) { } template <> -void ConvKernel::Compute( - const ConvParam& param) const { +void ConvKernel::Compute(const ConvParam& param) { std::cout << "init acl" << std::endl; AclConvOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp index 5596476e1bb33ecc2b3122bf237090b099307156..3711a946b508c9ad71f59dd85f2e01c99bccc9e5 100644 --- a/src/operators/kernel/mali/elementwise_add_kernel.cpp +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -34,7 +34,7 @@ bool ElementwiseAddKernel::Init( template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *Out = param.Out(); diff --git a/src/operators/kernel/mali/feed_kernel.cpp b/src/operators/kernel/mali/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6af6c1a88b8031da4a23dad1d3269935ce81b9a8 --- /dev/null +++ b/src/operators/kernel/mali/feed_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fetch_kernel.cpp b/src/operators/kernel/mali/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f74280cfb322b8135d99ca7fb7e2652a08588bb3 --- /dev/null +++ b/src/operators/kernel/mali/fetch_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp index c3197f38c6c6ee1a4f4f684c824a9a9e43d69d4f..5e59215834ce00e902deb19e54e149b3b4cfb8ac 100755 --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -26,7 +26,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { template <> void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { + const FusionFcParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); const Tensor *input_z = param.InputZ(); diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp index fc088f735c538bedc4d5c79593aa31c48acc4fc6..b46c9680d576ead3e7ab309c08894654a9fad04a 100644 --- a/src/operators/kernel/mali/lrn_kernel.cpp +++ b/src/operators/kernel/mali/lrn_kernel.cpp @@ -127,8 +127,7 @@ bool LrnKernel::Init(LrnParam* param) { } template <> -void LrnKernel::Compute( - const LrnParam& param) const { +void LrnKernel::Compute(const LrnParam& param) { std::cout << "init acl" << std::endl; AclLrnOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp index a9e54dad2b51c595be4f68df3916a4803047617e..da69f5e6fe5a4ec95373011d360cd4d9e20a8a61 100644 --- a/src/operators/kernel/mali/mul_kernel.cpp +++ b/src/operators/kernel/mali/mul_kernel.cpp @@ -27,8 +27,7 @@ bool MulKernel::Init(MulParam *param) { } template <> -void MulKernel::Compute( - const MulParam ¶m) 
const { +void MulKernel::Compute(const MulParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp index 33b3bd7017739144a519bfb1be247b4751883779..ec5d35a8f600d63a623b468c9c97c3540bf9c3f7 100644 --- a/src/operators/kernel/mali/pool_kernel.cpp +++ b/src/operators/kernel/mali/pool_kernel.cpp @@ -195,8 +195,7 @@ bool PoolKernel::Init(PoolParam* param) { } template <> -void PoolKernel::Compute( - const PoolParam& param) const { +void PoolKernel::Compute(const PoolParam& param) { std::cout << "init acl" << std::endl; AclPoolOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp index 10b270800dee1a0ad8176da1f788100d29b60173..68bb52af3ab9b262218223d971b044edd759b347 100644 --- a/src/operators/kernel/mali/relu_kernel.cpp +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -115,8 +115,7 @@ bool ReluKernel::Init(ReluParam* param) { } template <> -void ReluKernel::Compute( - const ReluParam& param) const { +void ReluKernel::Compute(const ReluParam& param) { std::cout << "init acl" << std::endl; AclReluOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp index 69c077e252162017cb477a000b5f17f5a968fc10..f98906c0a982c10896e75101eaa2732d75d6cdf4 100644 --- a/src/operators/kernel/mali/reshape_kernel.cpp +++ b/src/operators/kernel/mali/reshape_kernel.cpp @@ -28,7 +28,7 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute( - const ReshapeParam ¶m) const { + const ReshapeParam ¶m) { const auto *input_x = param.InputX(); const auto &input_x_dims = input_x->dims(); auto *out = param.Out(); diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp 
index d4f25c96cc47d7baa394645d4e0c84e0e3f7ad29..d6ce1ecb61c2790c68883231eb6b90dcde43a956 100644 --- a/src/operators/kernel/mali/softmax_kernel.cpp +++ b/src/operators/kernel/mali/softmax_kernel.cpp @@ -113,7 +113,7 @@ bool SoftmaxKernel::Init(SoftmaxParam* param) { template <> void SoftmaxKernel::Compute( - const SoftmaxParam& param) const { + const SoftmaxParam& param) { std::cout << "init acl" << std::endl; AclSoftmaxOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h index e441de4d4495b736aec248c0ef85191b32bfcbf9..8deb4a2cb74786257ddfc12c805c4a7d56589bbf 100644 --- a/src/operators/kernel/mul_kernel.h +++ b/src/operators/kernel/mul_kernel.h @@ -29,7 +29,7 @@ template class MulKernel : public framework::OpKernelBase> { public: - void Compute(const MulParam ¶m) const; + void Compute(const MulParam ¶m); bool Init(MulParam *param); }; } // namespace operators diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h index b1b20ddd81b395ea94ae62b1abf2fe861d9257db..6a4ac0c22941aa364f05e38c7abaf29948cd324b 100644 --- a/src/operators/kernel/multiclass_nms_kernel.h +++ b/src/operators/kernel/multiclass_nms_kernel.h @@ -28,7 +28,7 @@ class MultiClassNMSKernel : public framework::OpKernelBase> { public: - void Compute(const MultiClassNMSParam& param) const; + void Compute(const MultiClassNMSParam& param); bool Init(MultiClassNMSParam* param); }; } // namespace operators diff --git a/src/operators/kernel/polygon_box_transform_kernel.h b/src/operators/kernel/polygon_box_transform_kernel.h index d5baf32cc7dca0aee1eb0b7c13895e806f70320a..6ed003a4c794e7293ae3506909a779f95a677579 100644 --- a/src/operators/kernel/polygon_box_transform_kernel.h +++ b/src/operators/kernel/polygon_box_transform_kernel.h @@ -27,7 +27,7 @@ class PolygonBoxTransformKernel : public framework::OpKernelBase> { public: - void Compute(const PolygonBoxTransformParam& param) const; 
+ void Compute(const PolygonBoxTransformParam& param); bool Init(PolygonBoxTransformParam* param); }; } // namespace operators diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h index 2be254444cc410fb95a94125cccb224ca9505545..ff80e0e44536d924026dbbe80a09677c069a8f6b 100644 --- a/src/operators/kernel/pool_kernel.h +++ b/src/operators/kernel/pool_kernel.h @@ -26,7 +26,7 @@ using framework::OpKernelBase; template class PoolKernel : public OpKernelBase> { public: - void Compute(const PoolParam ¶m) const override; + void Compute(const PoolParam ¶m); bool Init(PoolParam *param); }; } // namespace operators diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h index f6c7c3ac7f139cf7eafe8843ef48e53c90292082..c043149243f21f2abceeed37c5d0e81a61e5059f 100644 --- a/src/operators/kernel/prelu_kernel.h +++ b/src/operators/kernel/prelu_kernel.h @@ -24,7 +24,7 @@ template class PReluKernel : public framework::OpKernelBase> { public: - void Compute(const PReluParam& param) const; + void Compute(const PReluParam& param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h index 5640375483d42d52965986dab6795254bbf4b908..921d5901a8f24abab61f7aa94663385d91e597a7 100644 --- a/src/operators/kernel/prior_box_kernel.h +++ b/src/operators/kernel/prior_box_kernel.h @@ -54,7 +54,7 @@ template class PriorBoxKernel : public framework::OpKernelBase> { public: - void Compute(const PriorBoxParam& param) const; + void Compute(const PriorBoxParam& param); bool Init(PriorBoxParam* param); }; } // namespace operators diff --git a/src/operators/kernel/quantize_kernel.h b/src/operators/kernel/quantize_kernel.h index c55ca2182acd0f459c785f29d359ea9039a7350a..d864e00d9c80003d06d460f85b6fddda40e6d607 100644 --- a/src/operators/kernel/quantize_kernel.h +++ b/src/operators/kernel/quantize_kernel.h @@ -26,7 +26,7 @@ template class QuantizeKernel 
: public framework::OpKernelBase> { public: - void Compute(const QuantizeParam ¶m) const; + void Compute(const QuantizeParam ¶m); bool Init(QuantizeParam *param); }; diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h index b0c32791d626f14b0840ce1c8f3f12f02b403d97..48f47c2de6df8d3aa9461fba915fd1a6406d4b9f 100644 --- a/src/operators/kernel/relu_kernel.h +++ b/src/operators/kernel/relu_kernel.h @@ -27,7 +27,7 @@ template class ReluKernel : public framework::OpKernelBase> { public: - void Compute(const ReluParam& param) const; + void Compute(const ReluParam& param); bool Init(ReluParam* param); }; } // namespace operators diff --git a/src/operators/kernel/reshape2_kernel.h b/src/operators/kernel/reshape2_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c6ab3cf72a29612249d0ff08e56ef60ca30d59a8 --- /dev/null +++ b/src/operators/kernel/reshape2_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE2_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class Reshape2Kernel + : public framework::OpKernelBase> { + public: + void Compute(const Reshape2Param& param); + bool Init(Reshape2Param* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h index 73eb63f797f34ec4eb2baec8c4ab79fafb06f0e2..a5405654874320cdfe3432d16d3a8c6358d2d8e1 100644 --- a/src/operators/kernel/reshape_kernel.h +++ b/src/operators/kernel/reshape_kernel.h @@ -71,7 +71,7 @@ template class ReshapeKernel : public framework::OpKernelBase> { public: - void Compute(const ReshapeParam& param) const; + void Compute(const ReshapeParam& param); bool Init(ReshapeParam* param); }; } // namespace operators diff --git a/src/operators/kernel/resize_kernel.h b/src/operators/kernel/resize_kernel.h index 7102d2f4bc9bc64d53fa40697cf2b7a68d8be566..b25a0dcef5d291f03e4bb1a127eb0b592ee89055 100644 --- a/src/operators/kernel/resize_kernel.h +++ b/src/operators/kernel/resize_kernel.h @@ -74,7 +74,7 @@ template class ResizeKernel : public framework::OpKernelBase> { public: - void Compute(const ResizeParam ¶m) const; + void Compute(const ResizeParam ¶m); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h index 2da92d8d3c8b0d7867e7e6e628a04a853dd69464..a17e57652224992b2ee7127e6081804bf3253fb1 100644 --- a/src/operators/kernel/scale_kernel.h +++ b/src/operators/kernel/scale_kernel.h @@ -24,7 +24,7 @@ template class ScaleKernel : public framework::OpKernelBase> { public: - void Compute(const ScaleParam& param) const; + void Compute(const ScaleParam& param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/shape_kernel.h 
b/src/operators/kernel/shape_kernel.h index 7caf3e427a4f3b469265248708a3090c52d1ca91..9d3c6e1701523acc43410fb0e3402b5679d4f19a 100644 --- a/src/operators/kernel/shape_kernel.h +++ b/src/operators/kernel/shape_kernel.h @@ -28,7 +28,7 @@ template class ShapeKernel : public framework::OpKernelBase> { public: - void Compute(const ShapeParam& param) const; + void Compute(const ShapeParam& param); bool Init(ShapeParam* param); }; } // namespace operators diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index e68f215b00aa2f9faba850853efe4896752a8f7b..db9fc3dd3cb1e6c0eb56cd5a14a173f5a031263c 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -28,7 +28,7 @@ template class SigmoidKernel : public OpKernelBase> { public: - void Compute(const SigmoidParam& param) const override; + void Compute(const SigmoidParam& param); bool Init(SigmoidParam* param); }; diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index 17f7fe4a9ebf5b78fc92c41abd4756a7bc6bff45..89dba51d9e11570bd4228adb075ee104b2094fd8 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -24,7 +24,8 @@ template class SliceKernel : public framework::OpKernelBase> { public: - void Compute(const SliceParam& param) const {} + void Compute(const SliceParam& param); + bool Init(SliceParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h index 67bd9167e8c717355fc326d3025cde410ce66010..d7d7435fd5145e702de848872f93087188fd31fc 100644 --- a/src/operators/kernel/softmax_kernel.h +++ b/src/operators/kernel/softmax_kernel.h @@ -27,7 +27,7 @@ template class SoftmaxKernel : public OpKernelBase> { public: - void Compute(const SoftmaxParam ¶m) const override; + void Compute(const SoftmaxParam ¶m); bool Init(SoftmaxParam *param); }; } // namespace operators diff --git 
a/src/operators/kernel/split_kernel.h b/src/operators/kernel/split_kernel.h index 03a418de59606e42684c67ca3053fa8e39b07940..3a2c03dce718e650ebf9127044f0db44d9d5c9a5 100644 --- a/src/operators/kernel/split_kernel.h +++ b/src/operators/kernel/split_kernel.h @@ -28,7 +28,7 @@ template class SplitKernel : public framework::OpKernelBase> { public: - void Compute(const SplitParam& param) const; + void Compute(const SplitParam& param); bool Init(SplitParam* param); }; } // namespace operators diff --git a/src/operators/kernel/sum_kernel.h b/src/operators/kernel/sum_kernel.h index ed337432e0fd4bf4035b67d4099379ce29918547..967d6f8307beb90254c431beaf324e891898d1a0 100644 --- a/src/operators/kernel/sum_kernel.h +++ b/src/operators/kernel/sum_kernel.h @@ -25,7 +25,7 @@ template class SumKernel : public framework::OpKernelBase> { public: - void Compute(const SumParam ¶m) const; + void Compute(const SumParam ¶m); bool Init(SumParam *param); }; diff --git a/src/operators/kernel/tanh_kernel.h b/src/operators/kernel/tanh_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..035f64f840b0aae8970f1aa284054a7984fc7ed6 --- /dev/null +++ b/src/operators/kernel/tanh_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef TANH_OP + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class TanhKernel : public OpKernelBase> { + public: + void Compute(const TanhParam& param); + bool Init(TanhParam* param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/transpose2_kernel.h b/src/operators/kernel/transpose2_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a1fb186db09520bed6f891ef9381d96a06f648c9 --- /dev/null +++ b/src/operators/kernel/transpose2_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef TRANSPOSE2_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class Transpose2Kernel + : public framework::OpKernelBase> { + public: + void Compute(const Transpose2Param& param); + bool Init(Transpose2Param* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h index 56c41fd221e080a4db3b34fbd4ab208c9986c2a8..63ee6eb172ff691ff51dd3f74613cd3e412210bf 100644 --- a/src/operators/kernel/transpose_kernel.h +++ b/src/operators/kernel/transpose_kernel.h @@ -28,7 +28,7 @@ template class TransposeKernel : public framework::OpKernelBase> { public: - void Compute(const TransposeParam& param) const; + void Compute(const TransposeParam& param); bool Init(TransposeParam* param); }; } // namespace operators diff --git a/src/operators/lookup_op.h b/src/operators/lookup_op.h index 073e884e9157644670259b5acdb47443d2333e03..b5c3886cf46c9641e919aee32e7af30c6528309a 100644 --- a/src/operators/lookup_op.h +++ b/src/operators/lookup_op.h @@ -37,10 +37,6 @@ class LookupOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::LookupKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, LookupParam, - operators::LookupKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h index 26415a84aa96abdab91da7508080ce6a095aca62..3e1e92bfe6d9b888f100d07edaabfe0f8c6eaca5 100644 --- a/src/operators/lrn_op.h +++ b/src/operators/lrn_op.h @@ -35,10 +35,6 @@ class LrnOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::LrnKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, LrnParam, - operators::LrnKernel>::OperatorWithKernel; void 
InferShape() const override; protected: diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index dc699192a45a3fabe90ac2809f475bae5d5bbc10..b213f82351e03ddebc47efa672f0d21513a3098f 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -257,8 +257,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, const int h = static_cast(input->dims()[2]); const int w = static_cast(input->dims()[3]); - const int l = h; - + // const int l = h; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const int hxw = h * w; @@ -271,7 +270,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, vbias = vdupq_n_f32(bias_data[j]); } - int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + int w_mid = w - 2; // l=1->l_mid=-1,l=2->l_mid=0 float w00 = filter_data_tmp[0]; float w01 = filter_data_tmp[1]; float w02 = filter_data_tmp[2]; @@ -283,39 +282,38 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, float w22 = filter_data_tmp[8]; output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1]; - output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + - w20 * input_data[2 * l - 2] + - w21 * input_data[2 * l - 1]; - output_data[(l - 1) * l] = - w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + - w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1]; + w21 * input_data[w] + w22 * input_data[w + 1]; + output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] + + w20 * input_data[2 * w - 2] + + w21 * input_data[2 * w - 1]; + output_data[(h - 1) * w] = + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] + + 
w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; + output_data[h * w - 1] = + w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] + + w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1]; if (if_bias) { output_data[0] += bias_data[j]; - output_data[l - 1] += bias_data[j]; - output_data[(l - 1) * l] += bias_data[j]; - output_data[l * l - 1] += bias_data[j]; + output_data[w - 1] += bias_data[j]; + output_data[(h - 1) * w] += bias_data[j]; + output_data[h * w - 1] += bias_data[j]; } - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = - w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + - w11 * input_data[i * l] + w12 * input_data[i * l + 1] + - w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; - - output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + - w01 * input_data[i * l + l - 1 - l] + - w10 * input_data[i * l + l - 1 - 1] + - w11 * input_data[i * l + l - 1] + - w20 * input_data[i * l + l - 1 + l - 1] + - w21 * input_data[i * l + l - 1 + l]; + for (int i = 1; i < h - 1; ++i) { + output_data[i * w] = + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + + w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1]; + + output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + + w01 * input_data[i * w + w - 1 - w] + + w10 * input_data[i * w + w - 1 - 1] + + w11 * input_data[i * w + w - 1] + + w20 * input_data[i * w + w - 1 + w - 1] + + w21 * input_data[i * w + w - 1 + w]; if (if_bias) { - output_data[i * l] += bias_data[j]; - output_data[i * l + l - 1] += bias_data[j]; + output_data[i * w] += bias_data[j]; + output_data[i * w + w - 1] += bias_data[j]; } } @@ -325,15 +323,15 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + l); 
- const float *input_tmp_end = input_tmp + (l - 2) * l; + in2 = vld1q_f32(input_tmp + w); + const float *input_tmp_end = input_tmp + (h - 2) * w; in4 = vld1q_f32(input_tmp_end); - in6 = vld1q_f32(input_tmp_end + l); - int c_mid = l_mid; + in6 = vld1q_f32(input_tmp_end + w); + int c_mid = w_mid; auto output_ptr = output_data + 1; for (; c_mid > 3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + l + 4); + in3 = vld1q_f32(input_tmp + w + 4); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); @@ -352,7 +350,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, vst1q_f32(output_ptr, out0); in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + l + 4); + in7 = vld1q_f32(input_tmp_end + w + 4); tmp0 = vextq_f32(in4, in5, 1); tmp1 = vextq_f32(in4, in5, 2); @@ -367,7 +365,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, tmp3, w12); out0 = vaddq_f32(out0, vbias); - vst1q_f32(output_ptr + (l - 1) * l, out0); + vst1q_f32(output_ptr + (h - 1) * w, out0); // can optimize to each 8 stride. 
input_tmp += 4; @@ -380,8 +378,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, } // top right pad - float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]); tmp0 = vextq_f32(in0, pad0, 1); tmp1 = vextq_f32(in0, pad0, 2); @@ -409,8 +407,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, } // bottom right pad - float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); - float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]); + float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]); tmp0 = vextq_f32(in4, pad2, 1); tmp1 = vextq_f32(in4, pad2, 2); @@ -427,28 +425,28 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, for (int i = 0; i < c_mid; ++i) { if (i == 0) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0); } if (i == 1) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1); } if (i == 2) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2); } } // mid - for (int i = 0; i < l - 2; ++i) { - auto output_ptr = output_data + (i + 1) * l + 1; - input_tmp = input_data + i * l; + for (int i = 0; i < h - 2; ++i) { + auto output_ptr = output_data + (i + 1) * w + 1; + input_tmp = input_data + i * w; auto in0_tmp = vld1q_f32(input_tmp); - auto in2_tmp = vld1q_f32(input_tmp + l); - auto in4_tmp = vld1q_f32(input_tmp + l + l); - c_mid = l_mid; + auto in2_tmp = vld1q_f32(input_tmp + w); + auto in4_tmp = vld1q_f32(input_tmp + w + w); + c_mid = w_mid; for (; c_mid > 3; c_mid -= 4) { auto in1_tmp = vld1q_f32(input_tmp + 4); - auto in3_tmp = vld1q_f32(input_tmp + l + 4); - auto in5_tmp = vld1q_f32(input_tmp + l + 
l + 4); + auto in3_tmp = vld1q_f32(input_tmp + w + 4); + auto in5_tmp = vld1q_f32(input_tmp + w + w + 4); tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); @@ -477,9 +475,9 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, in4_tmp = in5_tmp; } - float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); - float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]); tmp0 = vextq_f32(in0_tmp, pad0, 1); tmp1 = vextq_f32(in0_tmp, pad0, 2); @@ -539,8 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, const int hxw = input_height * input_width; - const int l = input_height; - + // const int l = input_height; + const int h = input_height; + const int w = input_width; float32x4_t vzero = vdupq_n_f32(0); for (int b = 0; b < batch_size; b++) { @@ -626,54 +625,53 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1]; - output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + - w20 * input_data[2 * l - 2] + - w21 * input_data[2 * l - 1]; - output_data[(l - 1) * l] = - w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + - w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1]; + w21 * input_data[w] + w22 * input_data[w + 1]; + output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] + + w20 * input_data[2 * w - 2] + + w21 * input_data[2 * w - 1]; + output_data[(h - 1) * 
w] = + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] + + w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; + output_data[h * w - 1] = + w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] + + w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1]; output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c]; - output_data[l - 1] = - output_data[l - 1] * newscale_data[c] + newbias_data[c]; - output_data[(l - 1) * l] = - output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c]; - output_data[l * l - 1] = - output_data[l * l - 1] * newscale_data[c] + newbias_data[c]; + output_data[w - 1] = + output_data[w - 1] * newscale_data[c] + newbias_data[c]; + output_data[(h - 1) * w] = + output_data[(h - 1) * w] * newscale_data[c] + newbias_data[c]; + output_data[h * w - 1] = + output_data[h * w - 1] * newscale_data[c] + newbias_data[c]; if (if_relu) { output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; - output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1]; - output_data[(l - 1) * l] = - output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l]; - output_data[l * l - 1] = - output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1]; + output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1]; + output_data[(h - 1) * w] = + output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w]; + output_data[h * w - 1] = + output_data[h * w - 1] < 0 ? 
0 : output_data[h * w - 1]; } - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = - w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + - w11 * input_data[i * l] + w12 * input_data[i * l + 1] + - w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; - - output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + - w01 * input_data[i * l + l - 1 - l] + - w10 * input_data[i * l + l - 1 - 1] + - w11 * input_data[i * l + l - 1] + - w20 * input_data[i * l + l - 1 + l - 1] + - w21 * input_data[i * l + l - 1 + l]; - output_data[i * l] = - output_data[i * l] * newscale_data[c] + newbias_data[c]; - output_data[i * l + l - 1] = - output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c]; + for (int i = 1; i < h - 1; ++i) { + output_data[i * w] = + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + + w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1]; + + output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + + w01 * input_data[i * w + w - 1 - w] + + w10 * input_data[i * w + w - 1 - 1] + + w11 * input_data[i * w + w - 1] + + w20 * input_data[i * w + w - 1 + w - 1] + + w21 * input_data[i * w + w - 1 + w]; + output_data[i * w] = + output_data[i * w] * newscale_data[c] + newbias_data[c]; + output_data[i * w + w - 1] = + output_data[i * w + w - 1] * newscale_data[c] + newbias_data[c]; if (if_relu) { - output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l]; - output_data[i * l + l - 1] = - output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1]; + output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w]; + output_data[i * w + w - 1] = + output_data[i * w + w - 1] < 0 ? 
0 : output_data[i * w + w - 1]; } } @@ -776,7 +774,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, const int h = static_cast(input->dims()[2]); const int w = static_cast(input->dims()[3]); - const int l = h; +// const int l = h; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); @@ -792,7 +790,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, vnewbias = vdupq_n_f32(newbias_data[j]); vnewscale = vdupq_n_f32(newscale_data[j]); - int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + int w_mid = w - 2; // l=1->l_mid=-1,l=2->l_mid=0 float w00 = filter_data_tmp[0]; float w01 = filter_data_tmp[1]; float w02 = filter_data_tmp[2]; @@ -804,49 +802,49 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, float w22 = filter_data_tmp[8]; output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1]; - - output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - - 1] + w20 * input_data[2 * l - 2] + w21 * input_data[2 * l - 1]; - - output_data[(l - 1) * l] = - w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + - 1] + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1]; + w21 * input_data[w] + w22 * input_data[w + 1]; + + output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - + 1] + w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1]; + + output_data[(h - 1) * w] = + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + + 1] + w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; + output_data[h * w - 1] = w00 * input_data[h*w-w-2] + + w01 * input_data[h*w-w-1] + + w10 * input_data[h * w - 2] + + w11 * input_data[h * w - 1]; output_data[0] = 
output_data[0] * newscale_data[j] + - newbias_data[j]; output_data[l - 1] = output_data[l - 1] * - newscale_data[j] + newbias_data[j]; output_data[(l - 1) * l] = - output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j]; - output_data[l * l - 1] = - output_data[l * l - 1] * newscale_data[j] + newbias_data[j]; + newbias_data[j]; output_data[w - 1] = output_data[w - 1] * + newscale_data[j] + newbias_data[j]; output_data[(h - 1) * w] = + output_data[(h - 1) * w] * newscale_data[j] + newbias_data[j]; + output_data[h * w - 1] = + output_data[h * w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; - output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - - 1]; output_data[(l - 1) * l] = output_data[(l - 1) * l] < 0 ? 0 : - output_data[(l - 1) * l]; output_data[l * l - 1] = output_data[l * l - 1] - < 0 ? 0 : output_data[l * l - 1]; + output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - + 1]; output_data[(h - 1) * w] = output_data[(h - 1) * w] < 0 ? 0 : + output_data[(h - 1) * w]; output_data[h * w - 1] = output_data[h * w - 1] + < 0 ? 
0 : output_data[h * w - 1]; } - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = - w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] - + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + w21 * - input_data[i * l + l] + w22 * input_data[i * l + l + 1]; output_data[i * - l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i - * l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 * - input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21 - * input_data[i * l + l - 1 + l]; output_data[i * l] = output_data[i * l] - * newscale_data[j] + newbias_data[j]; output_data[i * l + l - 1] = - output_data[i * l + l - 1] * newscale_data[j] + + for (int i = 1; i < h - 1; ++i) { + output_data[i * w] = + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + w21 * + input_data[i * w + w] + w22 * input_data[i * w + w + 1]; output_data[i * + w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + w01 * input_data[i + * w + w - 1 - w] + w10 * input_data[i * w + w - 1 - 1] + w11 * + input_data[i * w + w - 1] + w20 * input_data[i * w + w - 1 + w - 1] + w21 + * input_data[i * w + w - 1 + w]; output_data[i * w] = output_data[i * w] + * newscale_data[j] + newbias_data[j]; output_data[i * w + w - 1] = + output_data[i * w + w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { - output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i - * l]; output_data[i * l + l - 1] = output_data[i * l + l - 1] < 0 ? 0 : - output_data[i * l + l - 1]; + output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i + * w]; output_data[i * w + w - 1] = output_data[i * w + w - 1] < 0 ? 
0 : + output_data[i * w + w - 1]; } } @@ -855,11 +853,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); in2 = - vld1q_f32(input_tmp + l); const float *input_tmp_end = input_tmp + (l - - 2) * l; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end + - l); int c_mid = l_mid; auto output_ptr = output_data + 1; for (; c_mid > + vld1q_f32(input_tmp + w); const float *input_tmp_end = input_tmp + (h - + 2) * w; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end + + w); int c_mid = w_mid; auto output_ptr = output_data + 1; for (; c_mid > 3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); in3 = - vld1q_f32(input_tmp + l + 4); + vld1q_f32(input_tmp + w + 4); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); @@ -880,7 +878,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, vst1q_f32(output_ptr, out0); in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + l + 4); + in7 = vld1q_f32(input_tmp_end + w + 4); tmp0 = vextq_f32(in4, in5, 1); tmp1 = vextq_f32(in4, in5, 2); @@ -897,7 +895,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, if (if_relu) { out0 = vmaxq_f32(out0, vzero); } - vst1q_f32(output_ptr + (l - 1) * l, out0); + vst1q_f32(output_ptr + (h - 1) * w, out0); // can optimize to each 8 stride. 
input_tmp += 4; @@ -910,8 +908,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } // top right pad - float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]); tmp0 = vextq_f32(in0, pad0, 1); tmp1 = vextq_f32(in0, pad0, 2); @@ -941,8 +939,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } // bottom right pad - float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); - float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]); + float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]); tmp0 = vextq_f32(in4, pad2, 1); tmp1 = vextq_f32(in4, pad2, 2); @@ -961,29 +959,29 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } for (int i = 0; i < c_mid; ++i) { if (i == 0) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0); } if (i == 1) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1); } if (i == 2) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2); } } // mid - for (int i = 0; i < l - 2; ++i) { - auto output_ptr = output_data + (i + 1) * l + 1; - input_tmp = input_data + i * l; + for (int i = 0; i < h - 2; ++i) { + auto output_ptr = output_data + (i + 1) * w + 1; + input_tmp = input_data + i * w; auto in0_tmp = vld1q_f32(input_tmp); - auto in2_tmp = vld1q_f32(input_tmp + l); - auto in4_tmp = vld1q_f32(input_tmp + l + l); - c_mid = l_mid; + auto in2_tmp = vld1q_f32(input_tmp + w); + auto in4_tmp = vld1q_f32(input_tmp + w + w); + c_mid = w_mid; for (; c_mid > 3; c_mid -= 4) { auto in1_tmp = vld1q_f32(input_tmp + 4); - auto in3_tmp = vld1q_f32(input_tmp + l + 4); - auto in5_tmp 
= vld1q_f32(input_tmp + l + l + 4); + auto in3_tmp = vld1q_f32(input_tmp + w + 4); + auto in5_tmp = vld1q_f32(input_tmp + w + w + 4); tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); @@ -1014,9 +1012,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, in4_tmp = in5_tmp; } - float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); - float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]); tmp0 = vextq_f32(in0_tmp, pad0, 1); tmp1 = vextq_f32(in0_tmp, pad0, 2); @@ -1060,6 +1058,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, #endif } +/// w!=h not fix void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { @@ -1275,7 +1274,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, const int in_l = in_h; const int inhxw = in_h * in_w; const int outhxw = out_h * out_w; - const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + /// todo : fix if_pad when w != h + const int if_pad_r = in_w - 1 == (out_w - 1) * 2 ? 1 : 0; + const int if_pad_b = in_h - 1 == (out_h - 1) * 2 ? 
1 : 0; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const float *input_row_ptr; @@ -1366,7 +1367,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); - if (!if_pad) { + if (!if_pad_b) { elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); elewise_res0 = @@ -1381,9 +1382,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, if ((w4 != w_times)) { vst1q_f32(output_row_ptr, res3); } else { - if (out_l - 2 - w_times * 3 == 1) { + if (out_w - 2 - w_times * 3 == 1) { vst1q_lane_f32(output_row_ptr, res3, 0); - } else if (out_l - 2 - w_times * 3 == 2) { + } else if (out_w - 2 - w_times * 3 == 2) { vst1q_lane_f32(output_row_ptr, res3, 0); vst1q_lane_f32(output_row_ptr + 1, res3, 1); } @@ -1393,64 +1394,65 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, } output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + - input_const[in_l] * w21 + - input_const[in_l + 1] * w22; + input_const[in_w] * w21 + + input_const[in_w + 1] * w22; - out2in_mid = (out_l - 1) * 2; - output_data_tmp[out_l - 1] = + out2in_mid = (out_w - 1) * 2; + output_data_tmp[out_w - 1] = w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w; + out2in_mid = (out_h - 1) * 2 * in_w; - output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_w * (out_h - 1)] = w01 * input_const[out2in_mid - in_w] + w02 * input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * 
input_const[out2in_mid + 1] + - (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + - w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + (1 - if_pad_b) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2; - output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_h * out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + - (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + - w21 * input_const[out2in_mid + in_w] + - w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w]) + + (1 - if_pad_b) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1]) + + (1 - if_pad_r) * (1 - if_pad_b) * w22 * + input_const[out2in_mid + in_w + 1]; if (if_bias) { output_data_tmp[0] += bias_data[j]; - output_data_tmp[out_l - 1] += bias_data[j]; - output_data_tmp[out_l * (out_l - 1)] += bias_data[j]; - output_data_tmp[out_l * out_l - 1] += bias_data[j]; + output_data_tmp[out_w - 1] += bias_data[j]; + output_data_tmp[out_w * (out_h - 1)] += bias_data[j]; + output_data_tmp[out_h * out_w - 1] += bias_data[j]; } for (int i = 1; i < out_h - 1; i++) { out2in_mid = i * 2 * in_w; - output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] + w02 * input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + w21 * input_const[out2in_mid + in_w] + w22 * input_const[out2in_mid + in_w + 1]; - out2in_mid = i * 2 * in_w + (out_l - 1) * 2; - output_data_tmp[i * out_l + out_l - 1] = + out2in_mid = i * 2 * in_w + (out_w - 1) * 2; + 
output_data_tmp[i * out_w + out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); if (if_bias) { - output_data_tmp[i * out_l] += bias_data[j]; - output_data_tmp[i * out_l + out_l - 1] += bias_data[j]; + output_data_tmp[i * out_w] += bias_data[j]; + output_data_tmp[i * out_w + out_w - 1] += bias_data[j]; } } filter_data_tmp += 9; @@ -1657,11 +1659,13 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, const int in_w = static_cast(input->dims()[3]); const int out_h = static_cast(output->dims()[2]); const int out_w = static_cast(output->dims()[3]); - const int out_l = out_h; - const int in_l = in_h; + // const int out_l = out_h; + // const int in_l = in_h; const int inhxw = in_h * in_w; const int outhxw = out_h * out_w; - const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + /// todo : fix if_pad when w != h + const int if_pad_r = in_w - 1 == (out_w - 1) * 2 ? 1 : 0; + const int if_pad_b = in_h - 1 == (out_h - 1) * 2 ? 
1 : 0; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const int w_times = (out_w - 2) / 3; @@ -1755,7 +1759,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); - if (!if_pad) { + if (!if_pad_b) { elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); elewise_res0 = @@ -1775,9 +1779,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, vst1q_lane_f32(output_row_ptr + 1, res3, 1); vst1q_lane_f32(output_row_ptr + 2, res3, 2); } else { - if (out_l - 2 - w_times * 3 == 1) { + if (out_w - 2 - w_times * 3 == 1) { vst1q_lane_f32(output_row_ptr, res3, 0); - } else if (out_l - 2 - w_times * 3 == 2) { + } else if (out_w - 2 - w_times * 3 == 2) { vst1q_lane_f32(output_row_ptr, res3, 0); vst1q_lane_f32(output_row_ptr + 1, res3, 1); } @@ -1787,90 +1791,91 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, } output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + - input_const[in_l] * w21 + - input_const[in_l + 1] * w22; + input_const[in_w] * w21 + + input_const[in_w + 1] * w22; - out2in_mid = (out_l - 1) * 2; - output_data_tmp[out_l - 1] = + out2in_mid = (out_w - 1) * 2; + output_data_tmp[out_w - 1] = w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w; + out2in_mid = (out_h - 1) * 2 * in_w; - output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_w * (out_h - 1)] = w01 * input_const[out2in_mid - in_w] + w02 * 
input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + - (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + - w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + (1 - if_pad_b) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2; - output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_h * out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + - (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + - w21 * input_const[out2in_mid + in_w] + - w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w]) + + (1 - if_pad_b) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1]) + + (1 - if_pad_r) * (1 - if_pad_b) * w22 * + input_const[out2in_mid + in_w + 1]; output_data_tmp[0] = output_data_tmp[0] * newscale_data[j] + newbias_data[j]; - output_data_tmp[out_l - 1] = - output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j]; - output_data_tmp[out_l * (out_l - 1)] = - output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] + + output_data_tmp[out_w - 1] = + output_data_tmp[out_w - 1] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_w * (out_h - 1)] = + output_data_tmp[out_w * (out_h - 1)] * newscale_data[j] + newbias_data[j]; - output_data_tmp[out_l * out_l - 1] = - output_data_tmp[out_l * out_l - 1] * newscale_data[j] + + output_data_tmp[out_h * out_w - 1] = + output_data_tmp[out_h * out_w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { output_data_tmp[0] = output_data_tmp[0] < 0 ? 
0 : output_data_tmp[0]; - output_data_tmp[out_l - 1] = - output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1]; - output_data_tmp[out_l * (out_l - 1)] = - output_data_tmp[out_l * (out_l - 1)] < 0 + output_data_tmp[out_w - 1] = + output_data_tmp[out_w - 1] < 0 ? 0 : output_data_tmp[out_w - 1]; + output_data_tmp[out_w * (out_h - 1)] = + output_data_tmp[out_w * (out_h - 1)] < 0 ? 0 - : output_data_tmp[out_l * (out_l - 1)]; - output_data_tmp[out_l * out_l - 1] = - output_data_tmp[out_l * out_l - 1] < 0 + : output_data_tmp[out_w * (out_h - 1)]; + output_data_tmp[out_h * out_w - 1] = + output_data_tmp[out_h * out_w - 1] < 0 ? 0 - : output_data_tmp[out_l * out_l - 1]; + : output_data_tmp[out_h * out_w - 1]; } for (int i = 1; i < out_h - 1; i++) { out2in_mid = i * 2 * in_w; - output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] + w02 * input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + w21 * input_const[out2in_mid + in_w] + w22 * input_const[out2in_mid + in_w + 1]; - out2in_mid = i * 2 * in_w + (out_l - 1) * 2; - output_data_tmp[i * out_l + out_l - 1] = + out2in_mid = i * 2 * in_w + (out_w - 1) * 2; + output_data_tmp[i * out_w + out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); - output_data_tmp[i * out_l] = - output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j]; - output_data_tmp[i * out_l + out_l - 1] = - output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] + + (1 - if_pad_r) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * 
input_const[out2in_mid + in_w + 1]); + output_data_tmp[i * out_w] = + output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j]; + output_data_tmp[i * out_w + out_w - 1] = + output_data_tmp[i * out_w + out_w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { - output_data_tmp[i * out_l] = - output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l]; - output_data_tmp[i * out_l + out_l - 1] = - output_data_tmp[i * out_l + out_l - 1] < 0 + output_data_tmp[i * out_w] = + output_data_tmp[i * out_w] < 0 ? 0 : output_data_tmp[i * out_w]; + output_data_tmp[i * out_w + out_w - 1] = + output_data_tmp[i * out_w + out_w - 1] < 0 ? 0 - : output_data_tmp[i * out_l + out_l - 1]; + : output_data_tmp[i * out_w + out_w - 1]; } } } diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 44621ba99a92a3ed456b8d7d0959e3580662d910..d3e6de3134ff91f47c66c927194a5ba688e931b0 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -3230,6 +3230,8 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, int L1 = 64 / max_threads * 1024; KC = k; + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); @@ -3255,7 +3257,7 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3284,12 +3286,10 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(m, KC, m % MR, A, lda, packedA); packedB = 
static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - memset(static_cast(zero), 0, sizeof(float) * KC); packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); @@ -3307,8 +3307,13 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, float *local_A = packedA + MC * KC * local_threads; float *local_C = packedC + MC * NC * local_threads; (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A); - InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, - &C(i, 0), ldc, relu, bias + i); + if (bias == nullptr) { + InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, + &C(i, 0), ldc, relu, nullptr); + } else { + InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, + &C(i, 0), ldc, relu, bias + i); + } } } else { #pragma omp parallel for @@ -3347,6 +3352,8 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int L1 = 64 / max_threads * 1024; KC = k; + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); @@ -3372,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3400,12 +3407,10 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(m, KC, m % MR, A, lda, packedA); packedB = static_cast( 
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - memset(static_cast(zero), 0, sizeof(float) * KC); packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); @@ -3475,6 +3480,8 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, int L1 = 8 * 1024; KC = k; + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); @@ -3500,7 +3507,7 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3528,12 +3535,10 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(m, KC, m % MR, A, lda, packedA); packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - memset(static_cast(zero), 0, sizeof(float) * KC); packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); diff --git a/src/operators/math/gru_compute.cpp b/src/operators/math/gru_compute.cpp index 8ebf92059b5f5205b3169a6992039d3f050b3b4b..9e77f572c53bc2ba9be57f5edbd2b4bf85f5305e 100644 --- a/src/operators/math/gru_compute.cpp +++ b/src/operators/math/gru_compute.cpp @@ -30,20 +30,34 @@ struct GRUUnitFunctor { const ActivationType active_gate) { Gemm gemm; if (value.prev_out_value) { +#ifdef _OPENMP + 
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, + frame_size * 2, 1, value.gate_value, frame_size * 3, false, + nullptr); +#else gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3, false, nullptr); +#endif } forward_reset_output(forward::gru_resetOutput(), value, frame_size, batch_size, active_gate); if (value.prev_out_value) { +#ifdef _OPENMP + gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, + frame_size, 1, value.gate_value + frame_size * 2, + frame_size * 3, false, nullptr); +#else gemm.Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3, false, nullptr); +#endif } forward_final_output(forward::gru_finalOutput(), value, frame_size, diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 8d460688bbedf3d2a4e5dadaa5eebb1ca709cf05..9449ad70819f2ea114fac8848f6ee023871d47f2 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -117,7 +117,7 @@ void Im2ColFunctor::operator()( (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 
1 : 0)); int fill = isize % 2; if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && - dilation[0] == 1 && im_height > 2) { + dilation[0] == 1 && im_height > 2 && im_height == im_width) { for (int c = 0; c < im_channels; ++c) { int oosize = osize * osize; int nk4 = osize / 4; @@ -289,7 +289,7 @@ void Im2ColFunctor::operator()( im_data += isize * isize; } } else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 && - im_height > 2) { + im_height > 2 && im_height == im_width) { for (int c = 0; c < im_channels; ++c) { int oosize = osize * osize; int nk4 = osize / 4; @@ -676,7 +676,6 @@ class Im2ColFunctor { const T *im_data = im.data(); T *col_data = col->data(); - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { for (int channel = 0; channel < im_channels; ++channel) { @@ -688,7 +687,6 @@ class Im2ColFunctor { ++filter_col_idx) { int im_col_offset = col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = ((((col_row_idx)*col_width + col_col_idx) * im_channels + channel) * @@ -696,7 +694,6 @@ class Im2ColFunctor { filter_row_idx) * filter_width + filter_col_idx; - int im_offset = (channel * im_height + im_row_offset) * im_width + im_col_offset; col_data[col_offset] = diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp index 9dc3dbafed990de2f4057d98a2accdd8ce2fd7db..88bf866b73f6f06d28f6e1868031ae1a25b9b31c 100644 --- a/src/operators/math/pool_2x2.cpp +++ b/src/operators/math/pool_2x2.cpp @@ -58,7 +58,7 @@ void Pool2x2Maxs2p0(vector strides, vector paddings, const float *in_ptr1 = input_data + i * input_batch_stride + c * input_channel_stride + ph * input_width; const float *in_ptr2 = in_ptr1 + input_width; - if (ph + 1 >= input_height) { + if (ph != input_height && ph + 1 >= input_height) { in_ptr2 = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * input_width)); memset(static_cast(const_cast(in_ptr2)), 
-FLT_MAX, @@ -122,19 +122,30 @@ void Pool2x2Maxs2p0(vector strides, vector paddings, #endif if (_w2 != 0) { - in_ptr1 += 16 * w1 + 4 * w2; - in_ptr2 += 16 * w1 + 4 * w2; - out_ptr += 8 * w1 + 2 * w2; + in_ptr1 = input_data + i * input_batch_stride + + c * input_channel_stride + ph * input_width + 16 * w1 + + 4 * w2; + in_ptr2 = in_ptr1 + input_width; + out_ptr = output_data + i * output_batch_stride + + c * output_channel_stride + ph / 2 * output_width + 8 * w1 + + 2 * w2; if (_w2 == 1) { *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; } else if (_w2 == 2) { - float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; + float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + in_ptr1++; + in_ptr2++; float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; *out_ptr = (temp > temp1) ? temp : temp1; } else if (_w2 == 3) { - float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; - float temp1 = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; - *out_ptr++ = (temp > temp1) ? temp : temp1; + float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + in_ptr1++; + in_ptr2++; + float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + in_ptr1++; + in_ptr2++; + *out_ptr = (temp > temp1) ? temp : temp1; + out_ptr++; *out_ptr = (*in_ptr1 > *in_ptr2) ? 
*in_ptr1 : *in_ptr2; } } @@ -173,7 +184,7 @@ void Pool2x2Avgs2p0(vector strides, vector paddings, int w2 = _w1 / 4; int _w2 = _w1 % 4; - float quarter = 1 / 4; + float quarter = 0.25; for (int i = 0; i < batch_size; ++i) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < input_height; ph += 2) { @@ -250,25 +261,32 @@ void Pool2x2Avgs2p0(vector strides, vector paddings, #endif if (_w2 != 0) { - in_ptr1 += 16 * w1 + 4 * w2; - in_ptr2 += 16 * w1 + 4 * w2; - out_ptr += 8 * w1 + 2 * w2; + in_ptr1 = input_data + i * input_batch_stride + + c * input_channel_stride + ph * input_width + 16 * w1 + + 4 * w2; + in_ptr2 = in_ptr1 + input_width; + out_ptr = output_data + i * output_batch_stride + + c * output_channel_stride + ph / 2 * output_width + 8 * w1 + + 2 * w2; if (_w2 == 1) { *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); } else if (_w2 == 2) { float temp = 0; - temp += *in_ptr1++; - temp += *in_ptr2++; temp += *in_ptr1; temp += *in_ptr2; - *out_ptr = 0.5 * temp; + in_ptr1++; + in_ptr2++; + temp += *in_ptr1; + temp += *in_ptr2; + *out_ptr = 0.25 * temp; } else if (_w2 == 3) { float temp = 0; temp += *in_ptr1++; temp += *in_ptr2++; temp += *in_ptr1++; temp += *in_ptr2++; - *out_ptr++ = 0.5 * temp; + *out_ptr = 0.25 * temp; + out_ptr++; *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); } } diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index dba88c93969014f2ad0d2636b4141c734dbc2ed5..9c23d99e60f6c7f38f372cbe2d221ae3c1a58592 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -141,13 +141,21 @@ class SoftmaxFuntor { public: void operator()(const framework::Tensor *X, framework::Tensor *Y) { const DDim dDim = X->dims(); + int dim1 = dDim[dDim.size() - 1]; + int dim0 = X->numel() / dim1 / dDim[0]; + framework::DDim matrix_shape = {dim0, dim1}; for (int i = 0; i < dDim[0]; ++i) { framework::Tensor sub_X = X->Slice(i, i + 1); framework::Tensor sub_Y = Y->Slice(i, i + 1); - + sub_X.Resize(matrix_shape); + 
sub_Y.Resize(matrix_shape); + for (int j = 0; j < dim0; j++) { + framework::Tensor sub_x = sub_X.Slice(j, j + 1); + framework::Tensor sub_y = sub_Y.Slice(j, j + 1); #ifdef __ARM_NEON - SoftmaxCacl(&sub_X, &sub_Y); + SoftmaxCacl(&sub_x, &sub_y); #endif + } } } }; diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index 5cd174db07973461fe699242a2013d9c4ea78732..51e828202e8da2080f014eff2bd60472dd873884 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -35,10 +35,6 @@ class MulOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::MulKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, MulParam, - operators::MulKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 4919ec69b6b5b1a702760f46ddbfc77b16c7875e..059974ab214004bcd1423514c85353da9a9bb6b8 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -40,10 +40,6 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< DeviceType, MultiClassNMSParam, operators::MultiClassNMSKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, - operators::MultiClassNMSKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/op_param.h b/src/operators/op_param.h index fb45cc9ac7fd60471f406f5208f906f000338011..0862ed9b69faa079b2bf841b014f451f9b44e855 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -23,8 +23,17 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#ifdef PADDLE_MOBILE_FPGA -#include "fpga/api.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif + +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" #endif namespace paddle_mobile { @@ -48,6 +57,17 @@ struct DtypeTensorTrait { typedef framework::Tensor rtype; }; +#ifdef PADDLE_MOBILE_CL +template <> +struct DtypeTensorTrait { + // This is the type we obtained in variable. + typedef framework::CLImage gtype; + // This type will be the parent class type + // or the same type. + typedef framework::CLImage rtype; +}; +#endif + class OpParam { protected: template @@ -243,6 +263,12 @@ class OpParam { return GetVarValue("Y", outputs, scope); } + template + static T *OutputXShapeFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("XShape", outputs, scope); + } + template static T *OutputBoxesFrom(const VariableNameMap &outputs, const Scope &scope) { @@ -403,6 +429,13 @@ class ConvParam : public OpParam { const int &Groups() const { return groups; } +#ifdef PADDLE_MOBILE_CL + int Offset() const { return offset_; } + + int SetOffset(int in_offset) { offset_ = in_offset; } + +#endif + private: RType *input_; mutable RType *output_; @@ -412,6 +445,20 @@ class ConvParam : public OpParam { vector dilations_; mutable enum ExecMode exec_mode_; int groups; + +#ifdef PADDLE_MOBILE_CL + int offset_; +#endif + +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::SplitConvArgs fpga_conv_args; + + public: + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } +#endif }; template Print &operator<<(Print &printer, const ConvParam &conv_param); @@ -556,15 +603,6 @@ class MulParam : OpParam { GType *out_; int x_num_col_dims_; int y_num_col_dims_; -#ifdef PADDLE_MOBILE_FPGA - - 
private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -722,6 +760,14 @@ class BatchNormParam : OpParam { const string &DataFormat() const { return data_format_; } + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + private: RType *input_x_; RType *output_y_; @@ -733,6 +779,8 @@ class BatchNormParam : OpParam { float momentum_; bool is_test_; string data_format_; + RType *new_bias_; + RType *new_scale_; }; #endif @@ -1041,18 +1089,18 @@ class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - auto var = scope->Var("batch_size"); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + auto var = scope.FindVar("batch_size"); batch_size = var->GetValue(); } - const GType *InputX() const { return input_x_; } + const LoDTensor *InputX() const { return input_x_; } GType *Out() const { return out_; } const int BatchSize() const { return batch_size; } private: - GType *input_x_; + LoDTensor *input_x_; GType *out_; int batch_size; }; @@ -1066,14 +1114,19 @@ class FetchParam : public OpParam { FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); } + const RType *InputX() const { return input_x_; } - RType *Out() const { return out_; } + Tensor *Out() const { return out_; } + + static Tensor 
*OutFrom(const VariableNameMap &outputs, const Scope &scope) { + return GetVarValue("Out", outputs, scope); + } private: RType *input_x_; - RType *out_; + Tensor *out_; }; #ifdef FILL_CONSTANT_OP @@ -1139,6 +1192,37 @@ class TransposeParam : public OpParam { }; #endif +#ifdef TRANSPOSE2_OP +template +class Transpose2Param : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + output_xshape_ = OutputXShapeFrom(outputs, scope); + axis_ = GetAttr>("axis", attrs); + } + + const RType *InputX() const { return input_x_; } + + RType *Out() const { return out_; } + + RType *OutputXShape() const { return output_xshape_; } + + const vector &Axis() const { return axis_; } + + private: + RType *input_x_; + RType *out_; + RType *output_xshape_; + vector axis_; +}; +#endif + #ifdef LOOKUP_OP template class LookupParam : public OpParam { @@ -1246,6 +1330,49 @@ class ReshapeParam : public OpParam { }; #endif +#ifdef RESHAPE2_OP +template +class Reshape2Param : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Reshape2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_shape_ = InputShapeFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + output_xshape_ = OutputXShapeFrom(outputs, scope); + shape_ = GetAttr>("shape", attrs); + if (HasAttr("inplace", attrs)) { + inplace_ = GetAttr("inplace", attrs); + } else { + inplace_ = false; + } + } + + const GType *InputX() const { return input_x_; } + + const GType *InputShape() const { return input_shape_; } + + GType *Out() const { return out_; } + + GType 
*OutputXShape() const { return output_xshape_; } + + const vector &Shape() const { return shape_; } + + const bool &Inplace() const { return inplace_; } + + private: + GType *input_x_; + GType *input_shape_; + GType *out_; + GType *output_xshape_; + vector shape_; + bool inplace_; +}; +#endif + #ifdef SCALE_OP template class ScaleParam : public OpParam { @@ -1380,13 +1507,13 @@ class ResizeParam : public OpParam { * @b op 层实例化好这个 param 传递给 kernel 层使用 * */ template -class ReluParam : public OpParam { +class ReluParamBase : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: - ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + ReluParamBase(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); } @@ -1399,6 +1526,46 @@ class ReluParam : public OpParam { RType *input_x_; RType *out_; }; + +template +class ReluParam : public ReluParamBase { + public: + using ReluParamBase::ReluParamBase; +}; + +#ifdef PADDLE_MOBILE_CL +template <> +class ReluParam : public ReluParamBase { + public: + using ReluParamBase::ReluParamBase; + framework::CLImage &getMidImage() { return midImage; } + + private: + framework::CLImage midImage; +}; +#endif + +#endif + +#ifdef TANH_OP +template +class TanhParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } + + private: + RType *input_x_; + RType *out_; +}; #endif #ifdef PRELU_OP @@ -1509,15 +1676,6 @@ class 
FusionConvAddParam : public ConvParam { RType *bias_; int axis_; RType *output_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; template @@ -1564,15 +1722,6 @@ class FusionConvAddPReluParam : public ConvParam { RType *output_; RType *alpha_; std::string mode_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1622,15 +1771,6 @@ class FusionConvAddAddPReluParam : public ConvParam { std::string keyOutput_; std::string keyX1_; std::string keyY1_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1697,15 +1837,6 @@ class FusionConvAddBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1783,15 +1914,6 @@ class FusionConvBNAddReluParam : public ConvParam { std::string keyBNY_; std::string keyX_; std::string keyY_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1850,15 +1972,6 @@ class FusionConvBNParam : public ConvParam { bool is_test_; RType 
*new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1925,15 +2038,6 @@ class FusionConvAddBNParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -2051,15 +2155,6 @@ class FusionConvBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -2080,9 +2175,9 @@ class Im2SequenceParam : public OpParam { paddings_ = GetAttr>("paddings", attrs); } - const RType *Input() const { return input_x_; } + const GType *Input() const { return input_x_; } - RType *Output() const { return out_; } + GType *Output() const { return out_; } const vector &Kernels() const { return kernels_; } @@ -2091,8 +2186,8 @@ class Im2SequenceParam : public OpParam { const vector &Paddings() const { return paddings_; } private: - RType *input_x_; - RType *out_; + GType *input_x_; + GType *out_; vector kernels_; vector strides_; vector paddings_; @@ -2168,9 +2263,24 @@ class ConvTransposeParam : public OpParam { vector paddings_; vector dilations_; int groups; + +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::DeconvArgs fpga_conv_args; + + public: + const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } 
+#endif }; #endif +#ifdef FUSION_DECONVRELU_OP +template +using FusionDeconvReluParam = ConvTransposeParam; +#endif + #ifdef GRU_OP template class GruParam : public OpParam { diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index dd23059ea01a332aff45137b7f7ed4c9f6c2e1bb..241f278ec0c5dd10e103b3ab1aa6f296323eebce 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -14,7 +14,8 @@ limitations under the License. */ #ifdef POOL_OP -#include "pool_op.h" +#include "operators/pool_op.h" +#include #include "framework/op_proto_maker.h" #include "framework/op_registry.h" @@ -68,5 +69,8 @@ REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(pool2d, ops::PoolOp); +#endif #endif diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 9880599ce5fc71048d6a555b3fa4848c5d7a8220..8f3957e29ee0802576f604900f8d15f86a864d53 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -38,9 +38,6 @@ class PoolOp : public OperatorWithKernel, : OperatorWithKernel, operators::PoolKernel>( type, inputs, outputs, attrs, scope) {} - using OperatorWithKernel< - DeviceType, PoolParam, - operators::PoolKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/prelu_op.h b/src/operators/prelu_op.h index af33476b7298a5728a6ef944506d55f422a2fa8c..5d0458f896941ece4208ca4b4931db189b4f436e 100644 --- a/src/operators/prelu_op.h +++ b/src/operators/prelu_op.h @@ -38,10 +38,6 @@ class PReluOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::PReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, PReluParam, - operators::PReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index 
f7e02802ae82368319d5e9095c73afcac295b4fc..f7e26430a0536cde011de14f670a9f46b8f517c1 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -40,9 +40,6 @@ class PriorBoxOp : public framework::OperatorWithKernel< operators::PriorBoxKernel>( type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, PriorBoxParam, - operators::PriorBoxKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 933e1cfce064d63664ebc35b7ac331d4f32b74b9..d6d83475ee7879f8bc967439dac2094df12c8617 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -39,5 +39,10 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif +#ifdef PADDLE_MOBILE_FPGA +#endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(relu, ops::ReluOp); +#endif #endif diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 584c9da3c80c4e3e9e69fdb70a602cdd486e26b8..1c94a7f6d71484d0a4bd14e89d8518f6e73a660b 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -41,10 +41,6 @@ class ReluOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ReluParam, - operators::ReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/reshape2_op.cpp b/src/operators/reshape2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1623076570d466fc53f885374060c5e744365ed --- /dev/null +++ b/src/operators/reshape2_op.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE2_OP + +#include "operators/reshape2_op.h" +#include +#include "operators/kernel/reshape_kernel.h" +namespace paddle_mobile { +namespace operators { + +template +void Reshape2Op::InferShape() const { + auto &shape = this->param_.Shape(); + auto input_x_dims = this->param_.InputX()->dims(); + auto out_dims = ValidateShape(shape, input_x_dims); + this->param_.Out()->Resize(out_dims); + std::vector xshape_dims(input_x_dims.size() + 1, 0); + for (int i = 0; i < input_x_dims.size(); ++i) { + xshape_dims[i + 1] = input_x_dims[i]; + } + this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims)); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op); +#endif + +#endif diff --git a/src/operators/reshape2_op.h b/src/operators/reshape2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3a06c2b9b90233b6ad0bacb6176f4cc274ff1cc0 --- /dev/null +++ b/src/operators/reshape2_op.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE2_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/reshape2_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class Reshape2Op : public framework::OperatorWithKernel< + DeviceType, Reshape2Param, + operators::Reshape2Kernel> { + public: + Reshape2Op(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::Reshape2Kernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, Reshape2Param, + operators::Reshape2Kernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index 214007545844e19cf698c6294416a6501a595b58..8ceb157d28764de469e5de5108ad483387ba8ca9 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -38,5 +38,8 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp); +#endif #endif diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index a7347ddd8c6511224d4422f66eac71e61bf48549..3109303ff0e6007d0dbec133102924ff7bb30306 100644 --- 
a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -39,10 +39,6 @@ class ReshapeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ReshapeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ReshapeParam, - operators::ReshapeKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h index c0b38bb1cf4048af4b07d05f28a88a5ac8056ea3..954b3a82f8d2b5ccba242045c3d5e0f28553d484 100644 --- a/src/operators/resize_op.h +++ b/src/operators/resize_op.h @@ -38,10 +38,6 @@ class ResizeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ResizeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ResizeParam, - operators::ResizeKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/scale_op.h b/src/operators/scale_op.h index 4c5f5e620f25bef88533e80cdd78b243fef9bc70..56265259fe3a10feda67cc5c5732b2ba44e0730e 100644 --- a/src/operators/scale_op.h +++ b/src/operators/scale_op.h @@ -38,10 +38,6 @@ class ScaleOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ScaleKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ScaleParam, - operators::ScaleKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/shape_op.h b/src/operators/shape_op.h index 37b4fef1f4667051e51adbd96d6ada36bf36b647..116751c48e9ca3cc9ec936b1bcbaa72b6950bbc5 100644 --- a/src/operators/shape_op.h +++ b/src/operators/shape_op.h @@ -38,10 +38,6 @@ class ShapeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ShapeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ShapeParam, - 
operators::ShapeKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index 62fc65dce1025fff629dd81ea4a7f797ded1a1d6..7150a8a473e4cb1dba7230d63799bd263ef19812 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -36,11 +36,6 @@ class SigmoidOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SigmoidKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SigmoidParam, - operators::SigmoidKernel>::OperatorWithKernel; - void InferShape() const override; }; diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp index ac6c434c9450905931abeb395b294bed64c036b0..5704737902c03c476907ab527495b46c52567ed5 100644 --- a/src/operators/slice_op.cpp +++ b/src/operators/slice_op.cpp @@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); #endif - +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(slice, ops::SliceOp); +#endif #endif diff --git a/src/operators/slice_op.h b/src/operators/slice_op.h index 6bcb6fa0b9e88cefb3c88dfc096e1073ad261c1b..c45061696577dbe6948fb9cab7edebbaf8e15f2f 100644 --- a/src/operators/slice_op.h +++ b/src/operators/slice_op.h @@ -38,10 +38,6 @@ class SliceOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SliceKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SliceParam, - operators::SliceKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index e85edc69c3291c794f2eeb8119b91b2926c4d870..e605864706a6c59a35205b3072dd432b009c5d1f 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -36,5 +36,8 @@ REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp); #ifdef 
PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(softmax, ops::SoftmaxOp); +#endif #endif diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index cee5993174a02f610c1de0ad47ca6b73477fd946..422213feeaf2bc2301832de2f9c69827342a5062 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -36,11 +36,6 @@ class SoftmaxOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SoftmaxKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SoftmaxParam, - operators::SoftmaxKernel>::OperatorWithKernel; - void InferShape() const override; private: diff --git a/src/operators/split_op.h b/src/operators/split_op.h index d37bf7a0f93005a4c95e7e82c7c90313fda409cb..fc733c18520b971107e00003b3107b8c0aa9b36d 100644 --- a/src/operators/split_op.h +++ b/src/operators/split_op.h @@ -38,10 +38,6 @@ class SplitOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SplitKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SplitParam, - operators::SplitKernel>::OperatorWithKernel; void InferShape() const override; }; } // namespace operators diff --git a/src/operators/tanh_op.cpp b/src/operators/tanh_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..454cdfa26942eda225a811317e907b1989bcf61b --- /dev/null +++ b/src/operators/tanh_op.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TANH_OP + +#include "operators/tanh_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void TanhOp::InferShape() const { + this->param_.Out()->Resize(this->param_.InputX()->dims()); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(Tanh, ops::TanhOp); +#endif + +#endif diff --git a/src/operators/tanh_op.h b/src/operators/tanh_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82b0e4e9a07ae4fd3e4885790d5832065ed3eb49 --- /dev/null +++ b/src/operators/tanh_op.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef TANH_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/tanh_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class TanhOp : public framework::OperatorWithKernel< + DeviceType, TanhParam, + operators::TanhKernel> { + public: + TanhOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::TanhKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/transpose2_op.cpp b/src/operators/transpose2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..64d07991f60b4057e3d2841afa1bfe6483f31a88 --- /dev/null +++ b/src/operators/transpose2_op.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef TRANSPOSE2_OP + +#include + +#include "common/enforce.h" +#include "operators/transpose2_op.h" +namespace paddle_mobile { +namespace operators { + +template +void Transpose2Op::InferShape() const { + auto input_x_dims = this->param_.InputX()->dims(); + auto axis = this->param_.Axis(); + + size_t x_dims_size = input_x_dims.size(); + size_t axis_size = axis.size(); + + PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size), + "input_dims must " + "be equal to the axis_size. ") + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_MOBILE_ENFORCE( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + "Each element of Attribute axis should be a unique value " + "range from 0 to (dims - 1), " + "where the dims is the axis's size"); + } + framework::DDim out_dims(input_x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = input_x_dims[axis[i]]; + } + this->param_.Out()->Resize(out_dims); + std::vector xshape_dims(input_x_dims.size() + 1, 0); + for (int i = 0; i < input_x_dims.size(); ++i) { + xshape_dims[i + 1] = input_x_dims[i]; + } + this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims)); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op); +#endif + +#endif // TRANSPOSE_OP diff --git a/src/operators/transpose2_op.h b/src/operators/transpose2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f1339cc59e0c71a232eddd5dcef47f62994b80da --- /dev/null +++ b/src/operators/transpose2_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TRANSPOSE2_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/transpose2_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class Transpose2Op : public framework::OperatorWithKernel< + DeviceType, Transpose2Param, + operators::Transpose2Kernel> { + public: + Transpose2Op(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, Transpose2Param, + operators::Transpose2Kernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, Transpose2Param, + operators::Transpose2Kernel>::OperatorWithKernel; + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index 7e5f72058d4e06f5b5b1fef81ade0350ea78f21c..eb98ce235491632aa1149acc158552955c2c1e0c 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -40,10 +40,6 @@ class TransposeOp : public framework::OperatorWithKernel< DeviceType, TransposeParam, operators::TransposeKernel>(type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, TransposeParam, - operators::TransposeKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 
8f24392f7a7acf8dd7529619c4e950dd3598f1d5..38dc540e206ade4adb1427bf2121475217b2d730 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -62,15 +62,21 @@ if (CON GREATER -1) endif () -list(FIND NET "FPGAnets" CON) +list(FIND NET "FPGA_NET_V1" CON) if (CON GREATER -1) ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - -# ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) -# target_link_libraries(test-resnet paddle-mobile) set(FOUND_MATCH ON) +endif () +list(FIND NET "FPGA_NET_V2" CON) +if (CON GREATER -1) + ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet50 paddle-mobile) + + ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) + target_link_libraries(test-pe paddle-mobile) + set(FOUND_MATCH ON) endif () list(FIND NET "mobilenetssd" CON) @@ -184,6 +190,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) target_link_libraries(test-transpose-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h) + target_link_libraries(test-transpose2-op paddle-mobile) + # gen test ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) target_link_libraries(test-multiclassnms-op paddle-mobile) @@ -200,6 +210,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) target_link_libraries(test-reshape-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h) + target_link_libraries(test-reshape2-op paddle-mobile) + # gen test ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) 
target_link_libraries(test-relu-op paddle-mobile) @@ -330,6 +344,14 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) target_link_libraries(test-fssd paddle-mobile) + # gen test + ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) + target_link_libraries(test-mobilenetgpu paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-yologpu paddle-mobile) + # gen test ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h) target_link_libraries(test-multi-process paddle-mobile) @@ -338,5 +360,9 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp) target_link_libraries(test-benchmark paddle-mobile) + # gen test + ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) + target_link_libraries(test-eng paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif () diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 60f1856bb9294c6f9b4bd5cfb7d44f984c6f0794..970eff2400a1806c4db96cb6112c4d64dfc7eb3b 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include "common/log.h" +#include "framework/executor.h" #include "framework/op_registry.h" -#include "io/executor.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" @@ -29,9 +29,9 @@ limitations under the License. 
*/ #include "operators/softmax_op.h" #include "operators/transpose_op.h" -using paddle_mobile::Executor; using paddle_mobile::framework::BlockDesc; using paddle_mobile::framework::DDim; +using paddle_mobile::framework::Executor; using paddle_mobile::framework::LoDTensor; using paddle_mobile::framework::OpDesc; using paddle_mobile::framework::Program; diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp index 5d1a5828b36b3d9ed371a271af6db82657ff1596..44b9f4971bbd5cc69e1f663ae71e27e69c31a04b 100644 --- a/test/fpga/test_concat_op.cpp +++ b/test/fpga/test_concat_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/concat_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/fpga/test_pe.cpp b/test/fpga/test_pe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f5f2708b9e628af80433be4e7ccbb205d3fcd6f6 --- /dev/null +++ b/test/fpga/test_pe.cpp @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#include "fpga/V2/filter.h" + +namespace fpga = paddle_mobile::fpga; + +static const uint32_t N = 64; +static const uint32_t C = 3; +static const uint32_t H = 224; +static const uint32_t W = 224; +static const uint32_t G = 1; + +fpga::DataType input_type = fpga::DATA_TYPE_FP32; +fpga::DataType output_type = fpga::DATA_TYPE_FP16; + +void* ifm = nullptr; +void* ofm = nullptr; +void* filter = nullptr; +void* ifm_scale = nullptr; +void* ofm_scale = nullptr; +void* filter_scale = nullptr; + +int ifm_size = 0, ofm_size = 0; + +void format_data() { + ifm_scale = fpga::fpga_malloc(8); + ofm_scale = fpga::fpga_malloc(8); + int ifm_channel = fpga::filter::calc_aligned_channel(C); + int ofm_channel = fpga::filter::calc_aligned_channel(N); + int num = fpga::filter::calc_aligned_num(N, C); + DLOG << "ifm_channel = " << ifm_channel; + DLOG << "ofm_channel = " << ofm_channel; + DLOG << "aligned_num = " << num; + ifm_size = ifm_channel * H * W; + ofm_size = ofm_channel * H * W; + ifm = fpga::fpga_malloc(ifm_size * sizeof(float)); + ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t)); + memset(ifm, 0, ifm_size * sizeof(float)); + memset(ofm, 0, ofm_size * sizeof(int16_t)); + + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + for (int c = 0; c < C; c++) { + int index = h * W * ifm_channel + w * ifm_channel + c; + (reinterpret_cast(ifm))[index] = h + w + c * 0.1f; + // DLOG << index << ":" << ((float *) ifm)[index]; + } + } + } + fpga::fpga_flush(ifm, ifm_size * sizeof(float)); + fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t)); +} + +void print_fp16(int16_t* ptr, int total_size, int num) { + fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t)); + int stride = total_size / num; + for (int i = 0; i < total_size; i += stride) { + DLOG << fpga::fp16_2_fp32(ptr[i]); + } +} + +void print_fp32(float* ptr, int total_size, int num) { + fpga::fpga_invalidate(ptr, total_size * sizeof(float)); + int stride = 
total_size / num; + for (int i = 0; i < total_size; i += stride) { + DLOG << ptr[i]; + } +} + +void test_bypass() { + fpga::BypassArgs args; + args.input_data_type = input_type; + args.output_data_type = output_type; + args.image.address = ifm; + args.image.height = H; + args.image.width = W; + args.image.channels = C; + args.image.scale_address = reinterpret_cast(ifm_scale); + args.output.address = ofm; + args.output.scale_address = reinterpret_cast(ofm_scale); + fpga::PerformBypass(args); +} + +int main() { + paddle_mobile::fpga::open_device(); + format_data(); + DLOG << "format data done"; + print_fp32(reinterpret_cast(ifm), ifm_size, 200); + DLOG << "print input done"; + test_bypass(); + DLOG << "test done"; + print_fp16(reinterpret_cast(ofm), ifm_size, 200); + std::cout << "Computation done" << std::endl; + return 0; +} + +#endif diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 8a6a9dc8af836010695c6c6dc30e81ba224c7ffd..4d05328179fa2acc771e08a6dfddea4f770d9780 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -13,7 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include #include "../test_include.h" -#include "fpga/api.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + void readStream(std::string filename, float *buf) { std::ifstream in; in.open(filename, std::ios::in); diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 64fa42658be6b39fabe9bb26296a426949d31197..3d1b6af935b2f3e7f0c60f5c0cbbcc696f6aeba2 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include "../test_helper.h" -#include "io/loader.h" +#include "framework/loader.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - // auto program = loader.Load(g_googlenet, true); // auto program = loader.Load(g_mobilenet_ssd, true); - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params", false); + // auto program = loader.Load(std::string(g_ocr) + "/model", + // std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); + return 0; } diff --git a/test/framework/test_load_memory_inference_api.cpp b/test/framework/test_load_memory_inference_api.cpp index 05d51910172547c6dab7adc8231663be55c916bf..5b2773f8f1a21c3b9253b34fc5c18cd64ece27e7 100644 --- a/test/framework/test_load_memory_inference_api.cpp +++ b/test/framework/test_load_memory_inference_api.cpp @@ -55,11 +55,11 @@ static char *Get_binary_data(std::string filename) { paddle_mobile::PaddleMobileConfig GetConfig() { paddle_mobile::PaddleMobileConfig config; config.precision = paddle_mobile::PaddleMobileConfig::FP32; - config.device = paddle_mobile::PaddleMobileConfig::kCPU; + config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL; const std::shared_ptr &memory_pack = std::make_shared(); - auto model_path = std::string(g_genet_combine) + "/model"; - auto params_path = std::string(g_genet_combine) + "/params"; + auto model_path = std::string(g_mobilenet_combined) + "/model"; + auto params_path = std::string(g_mobilenet_combined) + "/params"; memory_pack->model_size = ReadBuffer(model_path.c_str(), &memory_pack->model_buf); std::cout << "sizeBuf: " << memory_pack->model_size << std::endl; diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 3cae963eca048da221d69c4c336dd4fdfecbb584..0392020789096e921865afed0b0fc51fa5999c6b 100644 --- a/test/framework/test_optimize.cpp +++ 
b/test/framework/test_optimize.cpp @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "../test_helper.h" +#include "framework/loader.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io/loader.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_mobilenet_ssd, true); paddle_mobile::framework::ProgramOptimize optimize; diff --git a/test/net/test_eng.cpp b/test/net/test_eng.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b99a6c927a44ca4032b352731b3971b63cf26b4f --- /dev/null +++ b/test/net/test_eng.cpp @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { +#ifdef PADDLE_MOBILE_CPU + paddle_mobile::PaddleMobile paddle_mobile; +#endif + // paddle_mobile.SetThreadNum(4); + auto time1 = time(); + if (paddle_mobile.Load(std::string(g_eng) + "/model", + std::string(g_eng) + "/params", true, false, 1, + true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + std::vector dims{1, 1, 48, 400}; + LoDTensor input_tensor; + SetupTensor(&input_tensor, {1, 1, 48, 400}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + // 预热十次 + for (int i = 0; i < 1; ++i) { + paddle_mobile.PredictLod(input_tensor); + } + auto time3 = time(); + for (int i = 0; i < 1; ++i) { + paddle_mobile.PredictLod(input_tensor); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + } + return 0; +} diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index a61df31e39c653e346c467c6ca17d5df3e08673e..c3379df609fc1e18b8c3545e25849f8a7ff0461b 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -28,8 +28,9 @@ int main() { bool optimize = true; auto time1 = time(); if (paddle_mobile.Load(g_googlenet, optimize)) { - auto time2 = time(); - std::cout << "load cost: " << time_diff(time1, time2) << "ms\n"; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; std::vector input; std::vector output; std::vector dims{1, 3, 224, 224}; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 4ed7d3b756cfef9554028e1d33f4dd86bf58e4b8..5cce53e866df0530d6c8e1f35bc7159ba6e5ba9b 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -19,14 +19,15 @@ limitations under the License. 
*/ int main() { paddle_mobile::PaddleMobile paddle_mobile; paddle_mobile.SetThreadNum(4); - auto time1 = time(); + auto time1 = paddle_mobile::time(); // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // std::string(g_mobilenet_detect) + "/params", true); auto isok = paddle_mobile.Load(g_mobilenet, true); if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" + << std::endl; std::vector input; std::vector dims{1, 3, 224, 224}; @@ -42,14 +43,14 @@ int main() { for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } - auto time3 = time(); + auto time3 = paddle_mobile::time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } DLOG << vec_result; - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; + auto time4 = paddle_mobile::time(); + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 + << "ms" << std::endl; } std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e410baf77616584995f1e3687b47ca0af337a231 --- /dev/null +++ b/test/net/test_mobilenet_GPU.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + auto isok = + paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model", + std::string(g_mobilenet_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + std::vector vec_result = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" 
+ << std::endl; + return 0; +} diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0215ded59e5f74f0c103d4b51abe06b487bd50ab --- /dev/null +++ b/test/net/test_yologpu.cpp @@ -0,0 +1,189 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "../../src/common/types.h" +#include "../../src/io/paddle_test_inference_api.h" +#include "../test_helper.h" +#include "../test_include.h" +void t1() { + paddle_mobile::PaddleMobile paddle_mobile_gpu; + paddle_mobile::PaddleMobile paddle_mobile_cpu; + paddle_mobile::PaddleTester paddle_test_cpu; + paddle_mobile::PaddleTester paddle_test_gpu; + printf("cpu time:%f\n", paddle_test_cpu.CaculatePredictTime()); + std::string path = "/data/local/tmp/bin"; + printf("gpu time:%f\n", paddle_test_gpu.CaculatePredictTime(&path)); + // paddle_mobile.SetThreadNum(4); +#ifdef PADDLE_MOBILE_CL + paddle_mobile_gpu.SetCLPath("/data/local/tmp/bin"); +#endif + auto time1 = paddle_mobile::time(); + auto isok = paddle_mobile_gpu.Load(std::string(g_yolo_mul) + "/model", + std::string(g_yolo_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + 
GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + // = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile_gpu.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } +} + +void t2() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + auto time1 = paddle_mobile::time(); + auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model", + std::string(g_yolo_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + // = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + 
std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } +} + +void t3() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + //#ifdef PADDLE_MOBILE_CL + // paddle_mobile.SetCLPath("/data/local/tmp/bin"); + //#endif + auto time1 = paddle_mobile::time(); + auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model", + std::string(g_yolo_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } +} + +int main() { + // std::thread th1(t1); + // std::thread th2(t2); 
+ // std::thread th3(t3); + std::thread th1(t1); + // th1.join(); + // th2.join(); + // th3.join(); + th1.join(); + return 0; +} diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp index 4ccad8c1512036c2400a09575b3775e75b26acce..c027d4bd31d5ff41f42e9cd333618f8630aad5d9 100644 --- a/test/operators/test_batchnorm_op.cpp +++ b/test/operators/test_batchnorm_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/batchnorm_op.h" @@ -127,7 +125,7 @@ template class TestBatchNormOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run BatchNormOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (4,10,2,2) diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp index 92cba3995c866c67c00491ad5cc38fb094594ad3..721e691107c2c2d0117fdedecf219484556c9541 100644 --- a/test/operators/test_box_coder_op.cpp +++ b/test/operators/test_box_coder_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once #include "../test_include.h" #include "operators/box_coder_op.h" @@ -115,7 +114,7 @@ template class TestBoxCoderOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run BoxCoderOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); paddle_mobile::framework::Tensor priorbox; diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp index edaa4ce1ddba251886c90262895333b0a56c3a07..1a347a9c37a96f3c31506d0b45f95e05b64292ff 100644 --- a/test/operators/test_concat_op.cpp +++ b/test/operators/test_concat_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/concat_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_conv_add_relu_op.cpp b/test/operators/test_conv_add_relu_op.cpp index 987f52cd62f91b3bc00cc1ef49bd21913e288d75..f170719218b98d341985a61ca6160884afe4ad3b 100644 --- a/test/operators/test_conv_add_relu_op.cpp +++ b/test/operators/test_conv_add_relu_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/fusion_conv_add_relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_googlenet, true); diff --git a/test/operators/test_depthwise_conv_op.cpp b/test/operators/test_depthwise_conv_op.cpp index bd2aad19eda896bad3da8a47f5b70b1a923dc1a7..77c76eedc5690412dfee95dd11e8a3fe9ed6ecbe 100644 --- a/test/operators/test_depthwise_conv_op.cpp +++ b/test/operators/test_depthwise_conv_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/depthwise_conv_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_mobilenet_ssd); diff --git a/test/operators/test_elementwise_add_op.cpp b/test/operators/test_elementwise_add_op.cpp index 0a5e9f7e92701e748df51078b21eb46eec90599d..3922b216cfc6ecf55be251ded02c0c064e2c3ffc 100644 --- a/test/operators/test_elementwise_add_op.cpp +++ b/test/operators/test_elementwise_add_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp index cfac83eff7a012d52d47f96e088bd8519603cadc..e1030852976a68db827ebb7629caf8bb199a2456 100644 --- a/test/operators/test_elementwise_sub_op.cpp +++ b/test/operators/test_elementwise_sub_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/elementwise_sub_op.h" @@ -106,7 +104,7 @@ template class TestElementwiseSubOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run ElementwiseSub Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_fill_constant_op.cpp b/test/operators/test_fill_constant_op.cpp index b099217d1641eb221b3d0d86d780fb6ecfa929bd..9dc7bb13884efb8860a6670e088bd5af67c1f0ea 100644 --- a/test/operators/test_fill_constant_op.cpp +++ b/test/operators/test_fill_constant_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #include "../test_include.h" #include "operators/fill_constant_op.h" @@ -95,7 +94,7 @@ template class TestFillConstantOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run FillConstant Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp index 7764d95ed72da613459233bd55ddcffdc444318f..347bcb40a6156a576842af34920bde838dd83cd8 100644 --- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "operators/fusion_conv_add_bn_relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_mobilenet, true); diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp index a23bde45cb74f0f75e655821b15e66b1cef4c081..a8ec4883aab4218aa526e7b90267998754d1eb30 100644 --- a/test/operators/test_fusion_fc_op.cpp +++ b/test/operators/test_fusion_fc_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include #include "../test_include.h" #include "operators/fusion_fc_op.h" @@ -114,7 +112,7 @@ template class TestFcOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Fc Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; diff --git a/test/operators/test_gru_op.cpp b/test/operators/test_gru_op.cpp index 52ab8b54d709391ea263b74a395a635ce50a18af..f2ce833661bfd1b3d751a7ac2d54cfb70114a6c6 100644 --- a/test/operators/test_gru_op.cpp +++ b/test/operators/test_gru_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/gru_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_nlp); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp index b45e437e12f95cd9f7050247fc03a152246d8122..3cd172d99bb1bb9c24f035d501dce362476909c2 100644 --- a/test/operators/test_im2sequence_op.cpp +++ b/test/operators/test_im2sequence_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/im2sequence_op.h" @@ -62,7 +60,6 @@ class TestIm2SequenceOp { Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1"); auto tensor_x1 = x1_feed_value->GetMutable(); tensor_x1->ShareDataWith(t1); - Variable *output = scope->Var("im2sequence_0.tmp_0"); auto *output_tensor = output->GetMutable(); output_tensor->mutable_data({2, 12}); @@ -102,7 +99,7 @@ template class TestIm2SequenceOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Im2Sequence Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_eng) + "/model", std::string(g_eng) + "/params"); diff --git a/test/operators/test_lrn_op.cpp b/test/operators/test_lrn_op.cpp index d4d9f8da802fc0f5f885a3b2e81cba695776c29e..5d1ac9b4dd7225112ace8bfbb13f926502c77b94 100644 --- a/test/operators/test_lrn_op.cpp +++ b/test/operators/test_lrn_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/lrn_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp index d1b98d4965fd182ab1adc480279f38cea53974be..32c2c1f6bd682fdac8d9b81155b8aa044b87232b 100644 --- a/test/operators/test_multiclass_nms_op.cpp +++ b/test/operators/test_multiclass_nms_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #include "../test_include.h" #include "operators/multiclass_nms_op.h" @@ -31,14 +30,12 @@ class TestMultiClassNMSOp { const std::vector> blocks = to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); for (auto block_desc : blocks) { std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); for (auto op : ops) { if (op->Type() == "multiclass_nms" && op->Input("BBoxes")[0] == "box_coder_0.tmp_0") { - DLOG << " mul attr size: " << op->GetAttrMap().size(); + DLOG << " attr size: " << op->GetAttrMap().size(); DLOG << " inputs size: " << op->GetInputs().size(); DLOG << " outputs size: " << op->GetOutputs().size(); DLOG << " BBoxes is : " << op->Input("BBoxes")[0]; @@ -55,14 +52,6 @@ class TestMultiClassNMSOp { << op->GetAttrMap().at("nms_top_k").Get(); DLOG << " score_threshold : " << op->GetAttrMap().at("score_threshold").Get(); - // DLOG << " variances : " << - // op->GetAttrMap().at("variances").Get>(); - // DLOG << " aspect_ratios : " << - // op->GetAttrMap().at("aspect_ratios").Get>(); - // DLOG << " min_sizes : " << - // op->GetAttrMap().at("min_sizes").Get>(); - // DLOG << " max_sizes : " << - // op->GetAttrMap().at("max_sizes").Get>(); std::shared_ptr> priorbox = std::make_shared>( op->Type(), 
op->GetInputs(), op->GetOutputs(), @@ -88,16 +77,12 @@ class TestMultiClassNMSOp { auto *output_tensor = output->GetMutable(); output_tensor->mutable_data({1917, 6}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - std::shared_ptr out_tensor = std::make_shared(); out_tensor.reset(output_tensor); predict(t1, t2, 0); return out_tensor; - // return outvars_tensor; } private: @@ -126,9 +111,8 @@ template class TestMultiClassNMSOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run MulticlassNMS Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); - paddle_mobile::framework::Tensor inputx1; SetupTensor(&inputx1, {1, 2, 4}, static_cast(0), static_cast(1)); diff --git a/test/operators/test_polygon_box_transform_op.cpp b/test/operators/test_polygon_box_transform_op.cpp index a71177ddbd8e4d8b0f204fd6ec9c948882499cbd..2347f06989153b9ce5994fa0e4d09673ab2698f1 100644 --- a/test/operators/test_polygon_box_transform_op.cpp +++ b/test/operators/test_polygon_box_transform_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once #include "../test_include.h" #include "operators/polygon_box_transform_op.h" @@ -97,7 +96,7 @@ template class TestPolygonBoxTransformOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run PolygonBoxTransform Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr)); paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 2daecd7b4c1a50c612bc784c801208d2e6f31482..09470caf82eb90df56f7aa79b6873c2a6b94fbef 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/pool_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_googlenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_prelu_op.cpp b/test/operators/test_prelu_op.cpp index e93d8732d18496721b24cfba1df296250169f8b2..f98c9904ae3799cb863142b0fcb332c74c91ba98 100644 --- a/test/operators/test_prelu_op.cpp +++ b/test/operators/test_prelu_op.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "operators/prelu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp index 8c697a9a7982f05b71caa5bb5f4d12e50dc9d418..424f2443f8627002cff0adc19600f9aba50ad0fb 100644 --- a/test/operators/test_prior_box_op.cpp +++ b/test/operators/test_prior_box_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once #include "../test_include.h" #include "operators/prior_box_op.h" @@ -126,7 +125,7 @@ template class TestPriorBoxOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run PriorBoxOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (1,3,300,300) diff --git a/test/operators/test_relu_op.cpp b/test/operators/test_relu_op.cpp index fad0d0c30a126cc2730e4aa8b87364eee9fc8209..542d3d18f6a383c1e03962ba845b39c04a51631b 100644 --- a/test/operators/test_relu_op.cpp +++ b/test/operators/test_relu_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_reshape2_op.cpp b/test/operators/test_reshape2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0d51f984a617ea37713e5830adf6b5d248fb434 --- /dev/null +++ b/test/operators/test_reshape2_op.cpp @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "../test_include.h" +#include "operators/reshape2_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestReshape2Op { + public: + explicit TestReshape2Op(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + const std::vector> blocks = + to_predict_program_->Blocks(); + for (auto block_desc : blocks) { + std::vector> ops = block_desc->Ops(); + for (auto op : ops) { + if (op->Type() == "reshape2") { + DLOG << " attr size: " << op->GetAttrMap().size(); + std::unordered_map attrs = op->GetAttrMap(); + for (std::unordered_map::iterator it = + attrs.begin(); + it != attrs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " inputs size: " << op->GetInputs().size(); + VariableNameMap inputs = op->GetInputs(); + for (VariableNameMap::iterator it = inputs.begin(); + it != inputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " outputs size: " << op->GetOutputs().size(); + VariableNameMap outputs = op->GetOutputs(); + for (VariableNameMap::iterator it = outputs.begin(); + it != outputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + input_var_name = op->Input("X")[0]; + output_var_name = op->Output("Out")[0]; + std::shared_ptr> op_ptr = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(op_ptr); + return; + } + } + } + } + + std::shared_ptr predict(const Tensor &t) { + auto scope = program_.scope; + Variable *input_feed_value = scope->Var(input_var_name); + auto tensor_input = input_feed_value->GetMutable(); + tensor_input->ShareDataWith(t); + + Variable *output = scope->Var(output_var_name); + auto *output_tensor = output->GetMutable(); + + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); + + 
predict(t, 0); + + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + string input_var_name; + string output_var_name; + + void predict(const Tensor &t, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } + } +}; + +template class TestReshape2Op; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run Reshape2 Test"; + paddle_mobile::framework::Loader loader; + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params"); + + paddle_mobile::framework::Tensor input; + SetupTensor(&input, {1, 4, 4}, static_cast(0), + static_cast(1)); + auto *input_ptr = input.data(); + for (int i = 0; i < 16; ++i) { + *(input_ptr + i) = i; + } + DLOG << "input : "; + for (int i = 0; i < input.numel(); ++i) { + DLOG << " index " << i << " : " << input_ptr[i]; + } + + paddle_mobile::framework::TestReshape2Op testReshape2Op( + program); + + auto output = testReshape2Op.predict(input); + auto *output_ptr = output->data(); + + DLOG << "output : "; + for (int i = 0; i < output->numel(); ++i) { + DLOG << " index " << i << " : " << output_ptr[i]; + } + return 0; +} diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index 3541151d8a1a286527e715f402df381d2efc094c..ff3299f5e818d8169a356323213707417d747dba 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/reshape_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_resize_op.cpp b/test/operators/test_resize_op.cpp index f4dcaa6885d92a727e8c97d5106c3b6913a4ab33..c452ef8d850f97f6988688c4e47d5041220cb828 100644 --- a/test/operators/test_resize_op.cpp +++ b/test/operators/test_resize_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/resize_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 739c594ad7044025eaa3637d8669c43f1c6c6348..df93da1529ae1e03561643ebeef4cb821f10d211 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h" #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io/executor.h" +#include "framework/executor.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index a0184729a8bc5e6b0ba952923eecd5242cfe36d4..f31bcb4e455a6b9699cf96271310681e51d4c6a7 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "operators/softmax_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp index e51d1cff5e99c5d9c444db046e78eee6a03f9243..9cabf1212525a7d4d6f36c45f81cba438694843d 100644 --- a/test/operators/test_sum_op.cpp +++ b/test/operators/test_sum_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/sum_op.h" @@ -105,7 +103,7 @@ template class TestSumOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Sum Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_eng) + "/model", std::string(g_eng) + "/params"); diff --git a/test/operators/test_transpose2_op.cpp b/test/operators/test_transpose2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5da0faaf119c553e2fb019de76bb40f875f9d673 --- /dev/null +++ b/test/operators/test_transpose2_op.cpp @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "../test_include.h" +#include "operators/transpose2_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestTranspose2Op { + public: + explicit TestTranspose2Op(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + const std::vector> blocks = + to_predict_program_->Blocks(); + for (auto block_desc : blocks) { + std::vector> ops = block_desc->Ops(); + for (auto op : ops) { + if (op->Type() == "transpose2") { + DLOG << " attr size: " << op->GetAttrMap().size(); + std::unordered_map attrs = op->GetAttrMap(); + for (std::unordered_map::iterator it = + attrs.begin(); + it != attrs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " inputs size: " << op->GetInputs().size(); + VariableNameMap inputs = op->GetInputs(); + for (VariableNameMap::iterator it = inputs.begin(); + it != inputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " outputs size: " << op->GetOutputs().size(); + VariableNameMap outputs = op->GetOutputs(); + for (VariableNameMap::iterator it = outputs.begin(); + it != outputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + input_var_name = op->Input("X")[0]; + output_var_name = op->Output("Out")[0]; + std::shared_ptr> op_ptr = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(op_ptr); + return; + } + } + } + } + + std::shared_ptr predict(const Tensor &t) { + auto scope = program_.scope; + Variable *input_feed_value = scope->Var(input_var_name); + auto tensor_input = input_feed_value->GetMutable(); + tensor_input->ShareDataWith(t); + + Variable *output = scope->Var(output_var_name); + auto *output_tensor = output->GetMutable(); + output_tensor->mutable_data({1, 2, 8}); + + std::shared_ptr out_tensor = 
std::make_shared(); + out_tensor.reset(output_tensor); + + predict(t, 0); + + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + string input_var_name; + string output_var_name; + + void predict(const Tensor &t, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } + } +}; + +template class TestTranspose2Op; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run Transpose2 Test"; + paddle_mobile::framework::Loader loader; + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params"); + + paddle_mobile::framework::Tensor input; + SetupTensor(&input, {1, 8, 2}, static_cast(0), + static_cast(1)); + auto *input_ptr = input.data(); + for (int i = 0; i < 16; ++i) { + *(input_ptr + i) = i; + } + DLOG << "input : "; + for (int i = 0; i < input.numel(); ++i) { + DLOG << " index " << i << " : " << input_ptr[i]; + } + + paddle_mobile::framework::TestTranspose2Op + testTranspose2Op(program); + + auto output = testTranspose2Op.predict(input); + auto *output_ptr = output->data(); + + DLOG << "output : "; + for (int i = 0; i < output->numel(); ++i) { + DLOG << " index " << i << " : " << output_ptr[i]; + } + return 0; +} diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index f83ee23c25d8f2588e0fe40d5fabc6114129b995..263fdcfa0ed448b126f4b9cb01ace889318eeddb 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "../test_include.h" #include "operators/transpose_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/test_helper.h b/test/test_helper.h index 41d6faed5229be8944178ea62786477ceadd6416..0eb11efd19b7d937f93eec14e163c8c42cb77f12 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -36,6 +36,7 @@ static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_mobilenet_mul = "../models/mobilenet_mul"; static const char *g_alexnet = "../models/alexnet"; static const char *g_inceptionv4 = "../models/inceptionv4"; static const char *g_nlp = "../models/nlp"; @@ -44,8 +45,8 @@ static const char *g_resnet = "../models/resnet"; static const char *g_googlenet_combine = "../models/googlenet_combine"; static const char *g_yolo = "../models/yolo"; static const char *g_yolo_combined = "../models/yolo_combined"; +static const char *g_yolo_mul = "../models/yolo_mul"; static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; - static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; static const char *g_test_image_1x3x224x224_banana = @@ -57,9 +58,12 @@ static const char *g_moto = "../images/moto_300x300_float"; static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; static const char *g_img = "../images/img.bin"; +static const char *g_yolo_img = "../images/in_put_1_3_416_416_2"; +static const char *g_mobilenet_img = "../images/image"; using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; +using namespace paddle_mobile; template void 
SetupTensor(paddle_mobile::framework::Tensor *input, diff --git a/third_party/opencl/OpenCL-Headers/CL/cl.h b/third_party/opencl/OpenCL-Headers/CL/cl.h new file mode 100644 index 0000000000000000000000000000000000000000..7224ed38faad33d8ed9c25acaeee26400c716aa6 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl.h @@ -0,0 +1,1783 @@ +/******************************************************************************* + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#include +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_device_svm_capabilities; +#endif +typedef cl_bitfield cl_command_queue_properties; +#ifdef CL_VERSION_1_2 +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; +#endif + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_queue_properties; +#endif +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_svm_mem_flags; +#endif +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +#ifdef CL_VERSION_1_2 +typedef cl_bitfield cl_mem_migration_flags; +#endif +typedef cl_uint cl_image_info; +#ifdef 
CL_VERSION_1_1 +typedef cl_uint cl_buffer_create_type; +#endif +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +#ifdef CL_VERSION_2_0 +typedef intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +#endif +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_program_binary_type; +#endif +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +#endif +typedef cl_uint cl_kernel_work_group_info; +#ifdef CL_VERSION_2_1 +typedef cl_uint cl_kernel_sub_group_info; +#endif +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; +#endif + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +#ifdef CL_VERSION_1_2 + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif + union { + cl_mem buffer; + cl_mem mem_object; + }; +} cl_image_desc; + +#endif + +#ifdef CL_VERSION_1_1 + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +#endif + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define 
CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#ifdef CL_VERSION_1_1 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 +#endif + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#ifdef 
CL_VERSION_1_1 +#define CL_INVALID_PROPERTY -64 +#endif +#ifdef CL_VERSION_1_2 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#endif +#ifdef CL_VERSION_2_0 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 +#endif +#ifdef CL_VERSION_2_2 +#define CL_INVALID_SPEC_ID -71 +#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 +#endif + + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#ifdef CL_VERSION_1_2 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE +#endif + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 +#ifdef CL_VERSION_2_1 +#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 +#endif + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#endif +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F 
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#endif +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ +#ifdef CL_VERSION_1_1 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define 
CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#endif +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#endif +#ifdef CL_VERSION_2_1 +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D +#endif + +/* 
cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#ifdef CL_VERSION_1_1 +#define CL_FP_SOFT_FLOAT (1 << 6) +#endif +#ifdef CL_VERSION_1_2 +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) +#endif + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) +#endif + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#ifdef CL_VERSION_1_1 +#define CL_CONTEXT_NUM_DEVICES 0x1083 +#endif + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#ifdef CL_VERSION_1_2 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +#endif + 
+#ifdef CL_VERSION_2_0 + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +#endif + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_SIZE 0x1094 +#endif +#ifdef CL_VERSION_2_1 +#define CL_QUEUE_DEVICE_DEFAULT 0x1095 +#endif + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#ifdef CL_VERSION_1_2 +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +#endif + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#ifdef CL_VERSION_1_1 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#endif +#ifdef CL_VERSION_2_0 +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define 
CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 +#endif + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#ifdef CL_VERSION_1_2 +#define CL_UNORM_INT24 0x10DF +#endif +#ifdef CL_VERSION_2_1 +#define CL_UNORM_INT_101010_2 0x10E0 +#endif + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#ifdef CL_VERSION_1_2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_OBJECT_PIPE 0x10F7 +#endif + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#ifdef CL_VERSION_1_1 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_USES_SVM_POINTER 0x1109 +#endif + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#ifdef CL_VERSION_1_2 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_pipe_info */ +#define 
CL_PIPE_PACKET_SIZE 0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 + +#endif + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#ifdef CL_VERSION_1_1 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 +#endif + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +#ifdef CL_VERSION_2_0 +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 +#endif + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#ifdef CL_VERSION_1_2 +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) +#endif + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#endif +#ifdef CL_VERSION_2_1 +#define CL_PROGRAM_IL 0x1169 +#endif +#ifdef CL_VERSION_2_2 +#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A +#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B +#endif + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#endif +#ifdef CL_VERSION_2_0 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define 
CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +#endif + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#endif +#ifdef CL_VERSION_2_1 +#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_type_qualifier */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#ifdef CL_VERSION_2_0 +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) +#endif + +#endif + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define 
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 +#endif + +#ifdef CL_VERSION_2_1 + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 +#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 + +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +#endif + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#ifdef CL_VERSION_1_1 +#define CL_EVENT_CONTEXT 0x11D4 +#endif + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#ifdef CL_VERSION_1_1 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#endif +#ifdef CL_VERSION_2_0 +#define CL_COMMAND_SVM_FREE 0x1209 +#define 
CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D +#endif + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +#ifdef CL_VERSION_1_1 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +#endif + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#ifdef CL_VERSION_2_0 +#define CL_PROFILING_COMMAND_COMPLETE 0x1284 +#endif + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint 
* /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetDefaultDeviceCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceAndHostTimer(cl_device_id /* device */, + cl_ulong* /* device_timestamp */, + cl_ulong* /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetHostTimer(cl_device_id /* device */, + cl_ulong * /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret 
*/) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context /* context */, + cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, + cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* 
errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem /* pipe */, + cl_pipe_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* SVM Allocation APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context /* context */, + cl_svm_mem_flags /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +/* Sampler APIs 
*/ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSamplerWithProperties(cl_context /* context */, + const cl_sampler_properties * /* normalized_coords */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context /* context */, + const void* /* il */, + size_t /* length */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramReleaseCallback(cl_program /* program */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_2_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramSpecializationConstant(cl_program /* program */, + cl_uint /* spec_id */, + size_t /* spec_size */, + const void* /* spec_value */) CL_API_SUFFIX__VERSION_2_2; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) 
CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCloneKernel(cl_kernel /* source_kernel */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel /* kernel */, + cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) 
CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_sub_group_info /* param_name */, + size_t /* input_value_size */, + const void* /*input_value */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */ ) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event 
* /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + 
const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + 
+#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * 
/*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMigrateMem(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + const void ** /* svm_pointers */, + const size_t * /* sizes */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_2_1; + +#endif + +#ifdef CL_VERSION_1_2 + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + /* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. 
+ */ + extern CL_API_ENTRY cl_int CL_API_CALL + clSetCommandQueueProperty(cl_command_queue /* command_queue */, + cl_command_queue_properties /* properties */, + cl_bool /* enable */, + cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY 
CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* Deprecated OpenCL 2.0 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h new file mode 100644 index 0000000000000000000000000000000000000000..d5960a43f72123bdd693da50d3ad9a3a82cd032c --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h new file mode 100644 index 0000000000000000000000000000000000000000..39f9072398a29ab0c5a91f3a08b8c75034e8ac17 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D11_H +#define __OPENCL_CL_D3D11_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d11_sharing */ +#define cl_khr_d3d11_sharing 1 + +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 + +/* cl_d3d11_device_source */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A + +/* cl_d3d11_device_set */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C + +/* cl_context_info */ +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D + +/* cl_mem_info */ +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E + +/* cl_image_info */ +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D11Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D11_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h new file mode 100644 index 0000000000000000000000000000000000000000..2729e8b9e89a10dc410863140a904ee67250950d --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h @@ -0,0 +1,132 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* cl_khr_dx9_media_sharing */ +#define cl_khr_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_media_adapter_type_khr; +typedef cl_uint cl_dx9_media_adapter_set_khr; + +#if defined(_WIN32) +#include +typedef struct _cl_dx9_surface_info_khr +{ + IDirect3DSurface9 *resource; + HANDLE shared_handle; +} cl_dx9_surface_info_khr; +#endif + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 + +/* cl_media_adapter_type_khr */ +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 + +/* cl_media_adapter_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 + +/* cl_context_info */ +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 + +/* cl_mem_info */ +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 + +/* cl_image_info */ +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr * media_adapter_type, + void * media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( + cl_context context, + cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, + void * surface_info, + cl_uint plane, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..331bab97c74050724573be927774523fb24101df --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h @@ -0,0 +1,182 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_dx9_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*************************************** +* cl_intel_dx9_media_sharing extension * +****************************************/ + +#define cl_intel_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_device_source_intel; +typedef cl_uint cl_dx9_device_set_intel; + +/* error codes */ +#define CL_INVALID_DX9_DEVICE_INTEL -1010 +#define CL_INVALID_DX9_RESOURCE_INTEL -1011 +#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 +#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 + +/* cl_dx9_device_source_intel */ +#define CL_D3D9_DEVICE_INTEL 0x4022 +#define CL_D3D9EX_DEVICE_INTEL 0x4070 +#define CL_DXVA_DEVICE_INTEL 0x4071 + +/* cl_dx9_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 +#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 + +/* cl_context_info */ +#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 +#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 +#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 + +/* cl_mem_info */ +#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 +#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 + +/* cl_image_info */ +#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 + +/* cl_command_type */ +#define 
CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A +#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B +/******************************************************************************/ + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromDX9INTEL( + cl_platform_id /* platform */, + cl_dx9_device_source_intel /* dx9_device_source */, + void* /* dx9_object */, + cl_dx9_device_set_intel /* dx9_device_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( + cl_platform_id /* platform */, + cl_dx9_device_source_intel /* dx9_device_source */, + void* /* dx9_object */, + cl_dx9_device_set_intel /* dx9_device_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromDX9MediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + IDirect3DSurface9* /* resource */, + HANDLE /* sharedHandle */, + UINT /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + IDirect3DSurface9* /* resource */, + HANDLE /* sharedHandle */, + UINT /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireDX9ObjectsINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const 
cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseDX9ObjectsINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_egl.h b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h new file mode 100644 index 0000000000000000000000000000000000000000..a765bd5266c02fc2fd2892f0257b228996d73c5f --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h @@ -0,0 +1,136 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#ifdef __APPLE__ + +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context /* context */, + CLeglDisplayKHR /* egldisplay */, + CLeglImageKHR /* eglimage */, + cl_mem_flags /* flags */, + const cl_egl_image_properties_khr * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL 
*clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context /* context */, + CLeglSyncKHR /* sync */, + CLeglDisplayKHR /* display */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_ext.h b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h new file mode 100644 index 
0000000000000000000000000000000000000000..af3ce461f3a48e7707caca966e704dfe5eb58e30 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h @@ -0,0 +1,723 @@ +/******************************************************************************* + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. 
*/ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include + #include +#else + #include +#endif + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ + +#if CL_TARGET_OPENCL_VERSION <= 110 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. 
+ */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). + * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + + 
+/******************************* + * cl_khr_il_program extension * + *******************************/ +#define cl_khr_il_program 1 + +/* New property to clGetDeviceInfo for retrieving supported intermediate + * languages + */ +#define CL_DEVICE_IL_VERSION_KHR 0x105B + +/* New property to clGetProgramInfo for retrieving for retrieving the IL of a + * program + */ +#define CL_PROGRAM_IL_KHR 0x1169 + +extern CL_API_ENTRY cl_program + CL_API_CALL clCreateProgramWithILKHR( + cl_context /* context */, + const void * /* il */, + size_t /* length */, + cl_int * /* errcode_ret */); + +typedef CL_API_ENTRY cl_program + (CL_API_CALL *clCreateProgramWithILKHR_fn)( + cl_context /* context */, + const void * /* il */, + size_t /* length */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +/* Extension: cl_khr_image2D_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without a copy. + * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t. + * Both the sampler and sampler-less read_image built-in functions are supported for 2D images + * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported + * for 2D images created from a buffer. + * + * When the 2D image from buffer is created, the client must specify the width, + * height, image format (i.e. channel order and channel data type) and optionally the row pitch + * + * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels. + * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels. 
+ */ + +/************************************** + * cl_khr_initialize_memory extension * + **************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/***************************************** + * cl_khr_create_command_queue extension * + *****************************************/ +#define cl_khr_create_command_queue 1 + +typedef cl_bitfield cl_queue_properties_khr; + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithPropertiesKHR( cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties_khr* /* properties */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; +typedef CL_API_ENTRY cl_command_queue +(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties_khr* /* properties */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ + +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define 
CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ + +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + + +/********************************* +* cl_arm_printf extension +*********************************/ + +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + + +/*********************************** +* cl_ext_device_fission extension +***********************************/ +#define cl_ext_device_fission 1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef cl_ulong cl_device_partition_property_ext; +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +/* cl_device_partition_property_ext */ +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 
+#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +/* clDeviceGetInfo selectors */ +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + +/* error codes */ +#define CL_DEVICE_PARTITION_FAILED_EXT -1057 +#define CL_INVALID_PARTITION_COUNT_EXT -1058 +#define CL_INVALID_PARTITION_NAME_EXT -1059 + +/* CL_AFFINITY_DOMAINs */ +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_device_partition_property_ext list terminators */ +#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + + +/*********************************** + * cl_ext_migrate_memobject extension definitions + ***********************************/ +#define cl_ext_migrate_memobject 1 + +typedef cl_bitfield cl_mem_migration_flags_ext; + +#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 + +#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjectEXT( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags_ext /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */ ); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + 
cl_mem_migration_flags_ext /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */ ); + + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ +#define cl_qcom_ext_host_ptr 1 + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + + +/******************************************* +* cl_qcom_ext_host_ptr_iocoherent extension +********************************************/ + +/* Cache policy specifying io-coherence */ +#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 + + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. 
*/ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* Virtual pointer to the android native buffer */ + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + + +/****************************************** + * cl_img_yuv_image extension * + ******************************************/ + +/* Image formats used in clCreateImage */ +#define CL_NV21_IMG 0x40D0 +#define CL_YV12_IMG 0x40D1 + + +/****************************************** + * cl_img_cached_allocations extension * + ******************************************/ + +/* Flag values used by clCreteBuffer */ +#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) +#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) + + +/****************************************** + * cl_img_use_gralloc_ptr extension * + ******************************************/ +#define cl_img_use_gralloc_ptr 1 + +/* Flag values used by clCreteBuffer */ +#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 +#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 + +/* Error code from clEnqueueReleaseGrallocObjectsIMG */ +#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* 
event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + + +/********************************* +* cl_khr_subgroups extension +*********************************/ +#define cl_khr_subgroups 1 + +#if !defined(CL_VERSION_2_1) +/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. + In hindsight, there should have been a khr suffix on this type for + the extension, but keeping it un-suffixed to maintain backwards + compatibility. */ +typedef cl_uint cl_kernel_sub_group_info; +#endif + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + + +/********************************* +* cl_khr_priority_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. 
*/ +#define cl_khr_priority_hints 1 + +typedef cl_uint cl_queue_priority_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_PRIORITY_KHR 0x1096 + +/* cl_queue_priority_khr */ +#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) +#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) +#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_throttle_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_throttle_hints 1 + +typedef cl_uint cl_queue_throttle_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_THROTTLE_KHR 0x1097 + +/* cl_queue_throttle_khr */ +#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) +#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) +#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_subgroup_named_barrier +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_subgroup_named_barrier 1 + +/* cl_device_info */ +#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 + + +/********************************** + * cl_arm_import_memory extension * + **********************************/ +#define cl_arm_import_memory 1 + +typedef intptr_t cl_import_properties_arm; + +/* Default and valid proporties name for cl_arm_import_memory */ +#define CL_IMPORT_TYPE_ARM 0x40B2 + +/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 + +/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 + +/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_SECURE_ARM 0x40B5 + +/* This extension adds a new function that allows for direct memory import into + * OpenCL via the clImportMemoryARM function. 
+ * + * Memory imported through this interface will be mapped into the device's page + * tables directly, providing zero copy access. It will never fall back to copy + * operations and aliased buffers. + * + * Types of memory supported for import are specified as additional extension + * strings. + * + * This extension produces cl_mem allocations which are compatible with all other + * users of cl_mem in the standard API. + * + * This extension maps pages with the same properties as the normal buffer creation + * function clCreateBuffer. + */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clImportMemoryARM( cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; + + +/****************************************** + * cl_arm_shared_virtual_memory extension * + ******************************************/ +#define cl_arm_shared_virtual_memory 1 + +/* Used by clGetDeviceInfo */ +#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 + +/* Used by clGetMemObjectInfo */ +#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 + +/* Used by clSetKernelExecInfoARM: */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_SVM_FREE_ARM 0x40BA +#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB +#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC +#define CL_COMMAND_SVM_MAP_ARM 0x40BD +#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE + +/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
*/ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) + +/* Flag values used by clSVMAllocARM: */ +#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) +#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) + +typedef cl_bitfield cl_svm_mem_flags_arm; +typedef cl_uint cl_kernel_exec_info_arm; +typedef cl_bitfield cl_device_svm_capabilities_arm; + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAllocARM(cl_context /* context */, + cl_svm_mem_flags_arm /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFreeARM(cl_context /* context */, + void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFreeARM(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMapARM(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointerARM(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2; +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfoARM(cl_kernel /* kernel */, + cl_kernel_exec_info_arm /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..1c358cfc10c5c01fa5b5bfcc65d4e5904f830a9e --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h @@ -0,0 +1,429 @@ +/******************************************************************************* + * Copyright (c) 2008-2017 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2017 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_ext_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + +#ifndef __CL_EXT_INTEL_H +#define __CL_EXT_INTEL_H + +#ifdef __APPLE__ + #include + #include +#else + #include + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*************************************** +* cl_intel_thread_local_exec extension * +****************************************/ + +#define cl_intel_thread_local_exec 1 + +#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) + +/*********************************************** +* cl_intel_device_partition_by_names extension * +************************************************/ + +#define cl_intel_device_partition_by_names 1 + +#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 +#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 + +/************************************************ +* cl_intel_accelerator extension * +* cl_intel_motion_estimation extension * +* cl_intel_advanced_motion_estimation extension * +*************************************************/ + +#define cl_intel_accelerator 1 +#define cl_intel_motion_estimation 1 +#define cl_intel_advanced_motion_estimation 1 + +typedef struct _cl_accelerator_intel* cl_accelerator_intel; +typedef cl_uint cl_accelerator_type_intel; +typedef cl_uint cl_accelerator_info_intel; + +typedef struct _cl_motion_estimation_desc_intel { + cl_uint mb_block_type; + cl_uint subpixel_mode; + cl_uint 
sad_adjust_mode; + cl_uint search_path_type; +} cl_motion_estimation_desc_intel; + +/* error codes */ +#define CL_INVALID_ACCELERATOR_INTEL -1094 +#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 +#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 +#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 + +/* cl_accelerator_type_intel */ +#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 + +/* cl_accelerator_info_intel */ +#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 +#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 +#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 +#define CL_ACCELERATOR_TYPE_INTEL 0x4093 + +/* cl_motion_detect_desc_intel flags */ +#define CL_ME_MB_TYPE_16x16_INTEL 0x0 +#define CL_ME_MB_TYPE_8x8_INTEL 0x1 +#define CL_ME_MB_TYPE_4x4_INTEL 0x2 + +#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 + +#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 + +#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 +#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 +#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 + +#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 +#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 +#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 +#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 + +#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 +#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 +#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 + +#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 +#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 +#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 +#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 +#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 + +#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 +#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 +#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 +#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 + +#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define 
CL_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 + +#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +/* cl_device_info */ +#define CL_DEVICE_ME_VERSION_INTEL 0x407E + +#define CL_ME_VERSION_LEGACY_INTEL 0x0 +#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 +#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 + +extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL +clCreateAcceleratorINTEL( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetAcceleratorInfoINTEL( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clGetAcceleratorInfoINTEL_fn)( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainAcceleratorINTEL( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseAcceleratorINTEL( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +/****************************************** +* cl_intel_simultaneous_sharing extension * +*******************************************/ + +#define cl_intel_simultaneous_sharing 1 + +#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 +#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 + +/*********************************** +* cl_intel_egl_image_yuv extension * +************************************/ + +#define cl_intel_egl_image_yuv 1 + +#define CL_EGL_YUV_PLANE_INTEL 0x4107 + +/******************************** +* cl_intel_packed_yuv extension * +*********************************/ + +#define cl_intel_packed_yuv 1 + +#define CL_YUYV_INTEL 0x4076 +#define CL_UYVY_INTEL 0x4077 +#define CL_YVYU_INTEL 0x4078 +#define CL_VYUY_INTEL 0x4079 + +/******************************************** +* cl_intel_required_subgroup_size extension * +*********************************************/ + +#define cl_intel_required_subgroup_size 1 + +#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 +#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 +#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A + +/**************************************** +* cl_intel_driver_diagnostics extension * 
+*****************************************/ + +#define cl_intel_driver_diagnostics 1 + +typedef cl_uint cl_diagnostics_verbose_level; + +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 + +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) + +/******************************** +* cl_intel_planar_yuv extension * +*********************************/ + +#define CL_NV12_INTEL 0x410E + +#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) +#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) + +#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E +#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F + +/******************************************************* +* cl_intel_device_side_avc_motion_estimation extension * +********************************************************/ + +#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B +#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C +#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D + +#define CL_AVC_ME_VERSION_0_INTEL 0x0; // No support. +#define CL_AVC_ME_VERSION_1_INTEL 0x1; // First supported version. 
+ +#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 + +#define CL_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CL_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CL_AVC_ME_MINOR_4x8_INTEL 0x2 +#define CL_AVC_ME_MINOR_4x4_INTEL 0x3 + +#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 +#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 +#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9 +#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 +#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa + +#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define 
CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 +#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 + +#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) + +#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 +#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +#define CL_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CL_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CL_AVC_ME_INTRA_4x4_INTEL 0x2 + +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define 
CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 + +#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_EXT_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_gl.h b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h new file mode 100644 index 0000000000000000000000000000000000000000..58b6449f9b4e98d561ee9a6f8b3daa6caede9f44 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h @@ -0,0 +1,175 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#ifdef CL_VERSION_1_2 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 +#endif + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#ifdef CL_VERSION_1_2 +#define CL_GL_NUM_SAMPLES 0x2012 +#endif + + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* 
param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..e3c14c6408c44160103bcb4c0dcd230a674643a5 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h @@ -0,0 +1,74 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include +#else + #include +#endif + +/* + * For each extension, follow this template + * cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ... define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. 
+ */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_platform.h b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..c2f408fed59fc42f9c2573061704610498890b40 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h @@ -0,0 +1,1460 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + #include + + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include +#else + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +/* + * Deprecation flags refer to the last version of the header in which the + * feature was not deprecated. + * + * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without + * deprecation but is deprecated in versions later than 1.1. 
+ */ + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + #define CL_API_SUFFIX__VERSION_2_1 + #define CL_EXT_SUFFIX__VERSION_2_1 + #define CL_API_SUFFIX__VERSION_2_2 + #define CL_EXT_SUFFIX__VERSION_2_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated)) + #define 
CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif + #elif defined(_WIN32) + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define 
CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 
+#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623158e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float 
__attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 
2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). 
The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + 
typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif 
+ #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >= 1500 + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ + #define __CL_HAS_ANON_STRUCT__ 1 + #define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #endif +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif 
+}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ 
cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + 
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( 
__CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( 
__CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. 
+ */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >=1500 + #pragma warning( pop ) + #endif +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..28444288573219be06fa449bb50161a20e95acfc --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h @@ -0,0 +1,172 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +File Name: cl_va_api_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + + +#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************** +* cl_intel_va_api_media_sharing extension * +*******************************************/ + +#define cl_intel_va_api_media_sharing 1 + +/* error codes */ +#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 +#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 +#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 +#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 + +/* cl_va_api_device_source_intel */ +#define CL_VA_API_DISPLAY_INTEL 0x4094 + +/* cl_va_api_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 +#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 + +/* cl_context_info */ +#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 + +/* cl_mem_info */ +#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 + +/* cl_image_info */ +#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A +#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B + +typedef cl_uint cl_va_api_device_source_intel; +typedef cl_uint cl_va_api_device_set_intel; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromVA_APIMediaAdapterINTEL( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + 
void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromVA_APIMediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) 
CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_version.h b/third_party/opencl/OpenCL-Headers/CL/cl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..bb766cb9bbddca65a3cd599375a24cb827789d08 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_version.h @@ -0,0 +1,86 @@ +/******************************************************************************* + * Copyright (c) 2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __CL_VERSION_H +#define __CL_VERSION_H + +/* Detect which version to target */ +#if !defined(CL_TARGET_OPENCL_VERSION) +#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)") +#define CL_TARGET_OPENCL_VERSION 220 +#endif +#if CL_TARGET_OPENCL_VERSION != 100 && \ + CL_TARGET_OPENCL_VERSION != 110 && \ + CL_TARGET_OPENCL_VERSION != 120 && \ + CL_TARGET_OPENCL_VERSION != 200 && \ + CL_TARGET_OPENCL_VERSION != 210 && \ + CL_TARGET_OPENCL_VERSION != 220 +#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220). Defaulting to 220 (OpenCL 2.2)") +#undef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 220 +#endif + + +/* OpenCL Version */ +#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) +#define CL_VERSION_2_2 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) +#define CL_VERSION_2_1 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) +#define CL_VERSION_2_0 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) +#define CL_VERSION_1_2 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) +#define CL_VERSION_1_1 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) +#define CL_VERSION_1_0 1 +#endif + +/* Allow deprecated APIs for older OpenCL versions. 
*/ +#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) +#define CL_USE_DEPRECATED_OPENCL_2_1_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) +#define CL_USE_DEPRECATED_OPENCL_2_0_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_0_APIS +#endif + +#endif /* __CL_VERSION_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/opencl.h b/third_party/opencl/OpenCL-Headers/CL/opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..9855cd75e7da064e094658b660851997c38a8c56 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/opencl.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/LICENSE b/third_party/opencl/OpenCL-Headers/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..020ce65fcac2a60e44dab1626fa4924dec17ea23 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2008-2015 The Khronos Group Inc. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and/or associated documentation files (the +"Materials"), to deal in the Materials without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Materials, and to +permit persons to whom the Materials are furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Materials. 
+ +MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + https://www.khronos.org/registry/ + +THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. diff --git a/third_party/opencl/OpenCL-Headers/README.md b/third_party/opencl/OpenCL-Headers/README.md new file mode 100644 index 0000000000000000000000000000000000000000..757e56e152f8bc2fed68d2cdf38164c3171f929d --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/README.md @@ -0,0 +1,50 @@ +# OpenCLTM API Headers + +This repository contains C language headers for the OpenCL API. + +The authoritative public repository for these headers is located at: + +https://github.com/KhronosGroup/OpenCL-Headers + +Issues, proposed fixes for issues, and other suggested changes should be +created using Github. + +## Branch Structure + +The OpenCL API headers in this repository are Unified headers and are designed +to work with all released OpenCL versions. This differs from previous OpenCL +API headers, where version-specific API headers either existed in separate +branches, or in separate folders in a branch. + +## Compiling for a Specific OpenCL Version + +By default, the OpenCL API headers in this repository are for the latest +OpenCL version (currently OpenCL 2.2). To use these API headers to target +a different OpenCL version, an application may `#define` the preprocessor +value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers. 
+The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing +the OpenCL API version. + +For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may +include the OpenCL API headers as follows: + +``` +#define CL_TARGET_OPENCL_VERSION 120 +#include +``` + +## Directory Structure + +``` +README.md This file +LICENSE Source license for the OpenCL API headers +CL/ Unified OpenCL API headers tree +``` + +## License + +See [LICENSE](LICENSE). + +--- + +OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos. diff --git a/tools/android-debug-script/push2android.sh b/tools/android-debug-script/push2android.sh index fae1a856123bd16cf3f7a115f61b3e4473ff58a3..68cbc6cf858ed9fbf7f1fd2522cd897309e31f78 100644 --- a/tools/android-debug-script/push2android.sh +++ b/tools/android-debug-script/push2android.sh @@ -5,12 +5,12 @@ MODELS_PATH="../../test/models/*" MODELS_SRC="../../test/models" IMAGE_PATH="../../test/images/*" EXE_FILE="../../test/build/*" -EXE_DIR="data/local/tmp/bin" +EXE_DIR="/data/local/tmp/bin" adb shell mkdir ${EXE_DIR} -MODELS_DIR="data/local/tmp/models" +MODELS_DIR="/data/local/tmp/models" adb shell mkdir ${MODELS_DIR} for file in `ls ${MODELS_SRC}` -do +do adb shell mkdir ${MODELS_DIR}"/"${file} done @@ -19,11 +19,15 @@ ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*" adb push ${ACL_BUILD_PATH} ${EXE_DIR} fi -IMAGES_DIR="data/local/tmp/images" +IMAGES_DIR="/data/local/tmp/images" adb shell mkdir ${IMAGES_DIR} LIB_PATH="../../build/release/arm-v7a/build/*" adb push ${EXE_FILE} ${EXE_DIR} -adb push ${LIB_PATH} ${EXE_DIR} +for file in ${LIB_PATH} +do + adb push ${file} ${EXE_DIR} +done + if [[ $1 != "npm" ]]; then adb push ${IMAGE_PATH} ${IMAGES_DIR} adb push ${MODELS_PATH} ${MODELS_DIR} diff --git a/tools/build.sh b/tools/build.sh index 330bc208ef2c5e27b7ad113e9a202948a144829c..65d6f58fbfbcff37d9a3325e62a70241fc54aed9 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -92,6 +92,8 @@ 
build_for_android() { fi cd "../build/release/${PLATFORM}" make -j 8 + mkdir ./build/cl_kernel + cp ../../../src/operators/kernel/cl/cl_kernel/* ./build/cl_kernel/ } diff --git a/tools/op.cmake b/tools/op.cmake index f7a6ed4b134f78ddb23487cd3a861f244e6a86db..ae1ac1a4ffd4a5a563c8a7be0b90c9f26a6b0f70 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -106,9 +106,9 @@ if (CON GREATER -1) set(FOUND_MATCH ON) endif() -list(FIND NET "FPGAnets" CON) +list(FIND NET "FPGA_NET_V1" CON) if (CON GREATER -1) - message("FPGAnets enabled") + message("FPGA_NET_V1 enabled") set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) @@ -124,6 +124,23 @@ if (CON GREATER -1) set(FOUND_MATCH ON) endif() +list(FIND NET "FPGA_NET_V2" CON) +if (CON GREATER -1) + message("FPGA_NET_V2 enabled") + set(FUSION_ELEMENTWISEADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(POOL_OP ON) + set(SOFTMAX_OP ON) + set(FUSION_CONVBNRELU_OP ON) + set(FUSION_CONVBN_OP ON) + set(CONV_TRANSPOSE_OP ON) + set(FUSION_DECONVRELU_OP ON) + set(SLICE_OP ON) + set(TANH_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FOUND_MATCH ON) +endif() + list(FIND NET "nlp" CON) if (CON GREATER -1) message("nlp enabled") @@ -201,9 +218,11 @@ if(NOT FOUND_MATCH) set(PRIORBOX_OP ON) set(RELU_OP ON) set(RESHAPE_OP ON) + set(RESHAPE2_OP ON) set(SIGMOID_OP ON) set(SOFTMAX_OP ON) set(TRANSPOSE_OP ON) + set(TRANSPOSE2_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDADDPRELU_OP ON) set(FUSION_DWCONVBNRELU_OP ON) @@ -246,9 +265,11 @@ endif() # option(PRIORBOX_OP "" ON) # option(RELU_OP "" ON) # option(RESHAPE_OP "" ON) + # option(RESHAPE2_OP "" ON) # option(SIGMOID_OP "" ON) # option(SOFTMAX_OP "" ON) # option(TRANSPOSE_OP "" ON) + # option(TRANSPOSE2_OP "" ON) # endif () if (BATCHNORM_OP) @@ -314,6 +335,9 @@ endif() if (RESHAPE_OP) add_definitions(-DRESHAPE_OP) endif() +if (RESHAPE2_OP) + add_definitions(-DRESHAPE2_OP) +endif() if (SIGMOID_OP) add_definitions(-DSIGMOID_OP) endif() @@ -323,6 +347,9 @@ endif() 
if (TRANSPOSE_OP) add_definitions(-DTRANSPOSE_OP) endif() +if (TRANSPOSE2_OP) + add_definitions(-DTRANSPOSE2_OP) +endif() if (FUSION_CONVADDBNRELU_OP) add_definitions(-DFUSION_CONVADDBNRELU_OP) endif() @@ -420,3 +447,9 @@ if (DEQUANT_OP) add_definitions(-DDEQUANT_OP) endif() +if (TANH_OP) + add_definitions(-DTANH_OP) +endif() +if (FUSION_DECONVRELU_OP) + add_definitions(-DFUSION_DECONVRELU_OP) +endif() \ No newline at end of file diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook index ece9ebc598e3fa63d1d76409dc0068854aaec851..92377d2dd6b53c69aaff41e4ea204b80fef31671 100644 --- a/tools/pre-commit.hooks/clang-format.hook +++ b/tools/pre-commit.hooks/clang-format.hook @@ -17,7 +17,7 @@ shift perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@" ( # remove clang format ios_io folder -flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||') +flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||') clang-format -i $flist ) perl -i -pe 's|// ||' "$@"