merge opencl

7b95cd02 · liuruilong · cf51610e · f6220fe4 · 7b95cd02 · 7b95cd02
312 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.6)
-project(paddle-mobile)
-# select the platform to build
-option(CPU "armv7 with neon support" ON)
-option(MALI_GPU "mali gpu support" OFF)
-option(FPGA "fpga support" OFF)
-option(USE_OPENMP "openmp support" OFF)
+option(USE_OPENMP "openmp support" ON)
 option(DEBUGING "enable debug mode" ON)
-option(USE_EXCEPTION "use std exception" OFF)
+option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" OFF)
+# select the platform to build
+# GPU_CL and GPU_MALI are each tested by an if() section further down in this
+# file; GPU_CL is the only GPU option that defaults to ON.
+option(CPU "armv7 with neon" ON)
+option(GPU_MALI "mali gpu" OFF)
+option(GPU_CL "opencl gpu" ON)
+option(FPGA "fpga" OFF)
+project(paddle-mobile)
 file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
@@ -70,7 +71,27 @@ else()
    endforeach()
 endif()
-if(MALI_GPU)
+if (GPU_CL)
+    # OpenCL backend enabled: define PADDLE_MOBILE_CL, set the targeted OpenCL
+    # version macro, and use the loader library and headers under third_party.
+    add_definitions(-DPADDLE_MOBILE_CL -DCL_TARGET_OPENCL_VERSION=220)
+    include_directories(third_party/opencl/OpenCL-Headers)
+    link_libraries("${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so")
+else()
+    # OpenCL backend disabled: take the CL-only sources and headers back out
+    # of the globbed PADDLE_MOBILE_CC / PADDLE_MOBILE_H lists.
+    file(GLOB_RECURSE _tmp_list
+            src/framework/cl/*.cpp
+            src/operators/kernel/cl/*.cpp)
+    file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h)
+    foreach(f IN LISTS _tmp_list)
+        list(REMOVE_ITEM PADDLE_MOBILE_CC "${f}")
+    endforeach()
+    foreach(f IN LISTS _tmp_list_h)
+        list(REMOVE_ITEM PADDLE_MOBILE_H "${f}")
+    endforeach()
+endif()
+if (GPU_MALI)
    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
    add_definitions(-DUSE_ACL=1)
    add_definitions(-DUSE_OPENCL)
@@ -124,17 +145,17 @@ endif()
 if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
 else()
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h)
-    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp)
    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
 endif()
 if(IS_IOS)
 else()
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h)
-    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm)
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h)
-endif()
+endif ()
 set(CMAKE_VERBOSE_MAKEFILE ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

--- a/src/common/common.h
+++ b/src/common/common.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include <chrono>
+#include <chrono>  // NOLINT
+namespace paddle_mobile {
 using Time = decltype(std::chrono::high_resolution_clock::now());
@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
 }
+}  // namespace paddle_mobile
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception {
    std::string detail(buffer);                                            \
    throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
                                               __FILE__, __LINE__);        \
-  }
+  }                                                                        \
+  exit(0);
 #define PADDLE_MOBILE_ENFORCE(stat, ...)                                      \
  {                                                                           \

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -39,7 +39,13 @@ struct PrecisionTrait<Precision::FP16> {
 };
 //! device type
-enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
+// Identifies the backend that a DeviceType<> tag stands for. Every value is
+// spelled out explicitly; kGPU_CL is the tag used by the GPU_CL typedef.
+enum DeviceTypeEnum {
+  kINVALID = -1,
+  kCPU = 0,
+  kFPGA = 1,
+  kGPU_MALI = 2,
+  kGPU_CL = 3
+};
 template <DeviceTypeEnum T>
 struct DeviceType {};
@@ -47,6 +53,7 @@ struct DeviceType {};
 typedef DeviceType<kCPU> CPU;
 typedef DeviceType<kFPGA> FPGA;
 typedef DeviceType<kGPU_MALI> GPU_MALI;
+typedef DeviceType<kGPU_CL> GPU_CL;
 //! data type
 enum DataType {

--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -117,9 +117,9 @@ class Attribute {
  template <typename Vistor>
  static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
-    if (attr.variant_.TypeId() == typeid(int).hash_code()) {
+    if (attr.variant_.TypeId() == typeid(int).hash_code()) {  // NOLINT
      return vistor(attr.variant_.Get<int>());
-    } else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
+    } else if (attr.variant_.TypeId() == typeid(float).hash_code()) {  // NOLINT
      return vistor(attr.variant_.Get<float>());
    } else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
      return vistor(attr.variant_.GetString());
@@ -129,7 +129,7 @@ class Attribute {
      return vistor(attr.variant_.Get<vector<float>>());
    } else if (attr.variant_.TypeId() == typeid(vector<string>).hash_code()) {
      return vistor(attr.variant_.Get<vector<string>>());
-    } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {
+    } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {  // NOLINT
      return vistor(attr.variant_.Get<bool>());
    } else if (attr.variant_.TypeId() == typeid(vector<bool>).hash_code()) {
      return vistor(attr.variant_.Get<vector<bool>>());
@@ -137,7 +137,6 @@ class Attribute {
      return vistor(attr.variant_.Get<int64_t>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
-      exit(0);
    }
  }

--- a/src/framework/cl/cl_deleter.h
+++ b/src/framework/cl/cl_deleter.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "CL/cl.h"
+// Deleter functor that releases an OpenCL kernel handle via clReleaseKernel.
+struct CLKernelDeleter {
+  template <typename Handle>
+  void operator()(Handle *kernel) {
+    clReleaseKernel(kernel);
+  }
+};
+// Deleter functor that releases an OpenCL memory object via
+// clReleaseMemObject.
+struct CLMemDeleter {
+  template <typename Handle>
+  void operator()(Handle *mem) {
+    clReleaseMemObject(mem);
+  }
+};
+// Deleter functor that releases an OpenCL event via clReleaseEvent.
+struct CLEventDeleter {
+  template <typename Handle>
+  void operator()(Handle *event) {
+    clReleaseEvent(event);
+  }
+};
+// Deleter functor that releases an OpenCL command queue via
+// clReleaseCommandQueue.
+struct CLCommQueueDeleter {
+  template <typename Handle>
+  void operator()(Handle *queue) {
+    clReleaseCommandQueue(queue);
+  }
+};
+// Deleter functor that releases an OpenCL context via clReleaseContext.
+struct CLContextDeleter {
+  template <typename Handle>
+  void operator()(Handle *context) {
+    clReleaseContext(context);
+  }
+};
+// Deleter functor that releases an OpenCL program via clReleaseProgram.
+struct CLProgramDeleter {
+  template <typename Handle>
+  void operator()(Handle *program) {
+    clReleaseProgram(program);
+  }
+};
--- a/src/framework/cl/cl_engine.cpp
+++ b/src/framework/cl/cl_engine.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "framework/cl/cl_engine.h"
+#include "CL/cl.h"
+#include "framework/cl/cl_tool.h"
+#include <cstdlib>
+#include <cstring>
+namespace paddle_mobile {
+namespace framework {
+// Performs the one-time lookup of the OpenCL platform and its GPU devices.
+//
+// Returns true when both lookups succeeded, now or on an earlier call.
+// Returns false when SetPlatform() or SetClDeviceId() reports failure; the
+// engine then stays uninitialized, so the next call tries again.
+bool CLEngine::Init() {
+  if (initialized_) {
+    return true;
+  }
+  // Short-circuit on purpose: there is nothing to query devices on when no
+  // platform was selected.
+  initialized_ = SetPlatform() && SetClDeviceId();
+  return initialized_;
+}
+// Returns the single function-local CLEngine. Init() is invoked on every
+// access and returns early once the engine is already initialized.
+CLEngine *CLEngine::Instance() {
+  static CLEngine engine;
+  CLEngine *self = &engine;
+  self->Init();
+  return self;
+}
+// Selects the first OpenCL platform the driver reports and stores it in
+// platform_.
+//
+// Returns false, leaving platform_ NULL, when either clGetPlatformIDs call
+// fails, when no platform is reported, or when the temporary id buffer cannot
+// be allocated.
+bool CLEngine::SetPlatform() {
+  platform_ = NULL;  // the chosen platform
+  // Zero-initialized so a failed query cannot leave the count indeterminate.
+  cl_uint numPlatforms = 0;
+  cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
+  if (status != CL_SUCCESS || numPlatforms == 0) {
+    return false;
+  }
+  cl_platform_id *platforms = reinterpret_cast<cl_platform_id *>(
+      malloc(numPlatforms * sizeof(cl_platform_id)));
+  if (platforms == NULL) {
+    return false;
+  }
+  status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  if (status == CL_SUCCESS) {
+    // For clarity, choose the first available platform.
+    platform_ = platforms[0];
+  }
+  free(platforms);
+  return status == CL_SUCCESS;
+}
+// Queries the GPU devices of platform_ and stores the malloc'ed id array in
+// devices_.
+//
+// Returns false, leaving devices_ NULL, when either clGetDeviceIDs call
+// fails, when no GPU device is reported, or when the array cannot be
+// allocated. The array is only published to devices_ after it was filled
+// successfully, so a failed call never leaves a half-initialized pointer.
+bool CLEngine::SetClDeviceId() {
+  cl_uint numDevices = 0;
+  devices_ = NULL;
+  cl_int status =
+      clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+  if (status != CL_SUCCESS || numDevices == 0) {
+    return false;
+  }
+  cl_device_id *devices = reinterpret_cast<cl_device_id *>(
+      malloc(numDevices * sizeof(cl_device_id)));
+  if (devices == NULL) {
+    return false;
+  }
+  status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices,
+                          NULL);
+  if (status != CL_SUCCESS) {
+    free(devices);
+    return false;
+  }
+  devices_ = devices;
+  return true;
+}
+// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel(
+//    const std::string &kernel_name) {
+//  std::unique_ptr<_cl_kernel, clKernel_deleter> kernel(
+//      clCreateKernel(program_.get(), kernel_name.c_str(), NULL));
+//  return std::move(kernel);
+//}
+//
+// bool CLEngine::SetClCommandQueue() {
+//  cl_int status;
+//  command_queue_.reset(
+//          clCreateCommandQueue(context_.get(), devices_[0], 0, &status));
+//  return true;
+//}
+// bool CLEngine::SetClContext() {
+//  context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL));
+//  return true;
+//}
+// bool CLEngine::LoadKernelFromFile(const char *kernel_file) {
+//  size_t size;
+//  char *str;
+//  std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary));
+//
+//  if (!f.is_open()) {
+//    return false;
+//  }
+//
+//  size_t fileSize;
+//  f.seekg(0, std::fstream::end);
+//  size = fileSize = (size_t)f.tellg();
+//  f.seekg(0, std::fstream::beg);
+//  str = new char[size + 1];
+//  if (!str) {
+//    f.close();
+//    return 0;
+//  }
+//
+//  f.read(str, fileSize);
+//  f.close();
+//  str[size] = '\0';
+//  const char *source = str;
+//  size_t sourceSize[] = {strlen(source)};
+//  program_.reset(
+//      clCreateProgramWithSource(context_.get(), 1, &source, sourceSize,
+//      NULL));
+//  return true;
+//}
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <string>
+#include "CL/cl.h"
+#include "common/enforce.h"
+#include "common/log.h"
+#include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_tool.h"
+namespace paddle_mobile {
+namespace framework {
+// Singleton that looks up the OpenCL platform and GPU devices and offers
+// factory helpers for contexts, command queues, programs and user events.
+// Each factory hands back a std::unique_ptr whose deleter releases the
+// underlying CL handle.
+class CLEngine {
+ public:
+  // Returns the shared engine instance (defined in cl_engine.cpp).
+  static CLEngine *Instance();
+
+  // Looks up platform and devices; see cl_engine.cpp.
+  bool Init();
+
+  // Creates a context that contains only the first entry of devices_.
+  std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() {
+    cl_int status;
+    cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status);
+    std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c);
+    CL_CHECK_ERRORS(status);
+    // Returning the local by name lets the compiler elide or move it.
+    return context_ptr;
+  }
+
+  // Creates a command queue on `context` for the first entry of devices_.
+  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
+      cl_context context) {
+    cl_int status;
+    cl_command_queue queue =
+        clCreateCommandQueue(context, devices_[0], 0, &status);
+    std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr(
+        queue);
+    CL_CHECK_ERRORS(status);
+    return command_queue_ptr;
+  }
+
+  // Reads the kernel source stored in `file_name` and creates (but does not
+  // build) a program object for it in `context`.
+  std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith(
+      cl_context context, std::string file_name) {
+    FILE *file = fopen(file_name.c_str(), "rb");
+    PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                          file_name.c_str());
+    fseek(file, 0, SEEK_END);
+    int64_t size = ftell(file);
+    PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
+    rewind(file);
+    // Owning buffer: the source text is freed automatically when this
+    // function returns instead of being leaked.
+    std::unique_ptr<char[]> data(new char[size + 1]);
+    size_t bytes_read = fread(data.get(), 1, size, file);
+    data[size] = '\0';
+    // Close before the size check so the FILE handle is released first.
+    fclose(file);
+    PADDLE_MOBILE_ENFORCE(bytes_read == static_cast<size_t>(size),
+                          "read binary file bytes do not match with fseek");
+    const char *source = data.get();
+    size_t sourceSize[] = {strlen(source)};
+    cl_program p =
+        clCreateProgramWithSource(context, 1, &source, sourceSize, &status_);
+    DLOG << " cl kernel file name: " << file_name;
+    DLOG << " source size: " << sourceSize[0];
+    CL_CHECK_ERRORS(status_);
+    std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p);
+    return program_ptr;
+  }
+
+  // Creates a user event in `context`.
+  std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
+    cl_event event = clCreateUserEvent(context, &status_);
+    std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
+    CL_CHECK_ERRORS(status_);
+    return event_ptr;
+  }
+
+  // Builds `program` with fast-relaxed-math and the cl_kernel include dir.
+  // Returns true only when clBuildProgram reports CL_SUCCESS; on
+  // CL_BUILD_PROGRAM_FAILURE the build log of the first device is logged.
+  bool BuildProgram(cl_program program) {
+    cl_int status;
+    status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel",
+                            0, 0);
+    CL_CHECK_ERRORS(status);
+    // Inspect the result of THIS build, not the member status_ left behind by
+    // an unrelated earlier call.
+    if (status == CL_BUILD_PROGRAM_FAILURE) {
+      size_t log_size = 0;
+      clGetProgramBuildInfo(program, DeviceID(), CL_PROGRAM_BUILD_LOG, 0, NULL,
+                            &log_size);
+      // std::string owns the log storage, so nothing is leaked.
+      std::string log(log_size, '\0');
+      if (log_size > 0) {
+        clGetProgramBuildInfo(program, DeviceID(), CL_PROGRAM_BUILD_LOG,
+                              log_size, &log[0], NULL);
+      }
+      DLOG << " program build error: " << log.c_str();
+    }
+    return status == CL_SUCCESS;
+  }
+
+  // Returns devices_[index]. No bounds check is performed; the caller must
+  // pass an index below the number of devices found during Init().
+  cl_device_id DeviceID(int index = 0) { return devices_[index]; }
+
+ private:
+  // Start from a fully defined state so nothing reads indeterminate handles
+  // before Init() has run.
+  CLEngine()
+      : initialized_(false),
+        platform_(NULL),
+        devices_(NULL),
+        status_(CL_SUCCESS) {}
+  bool SetPlatform();
+  bool SetClDeviceId();
+  bool initialized_;
+  cl_platform_id platform_;
+  cl_device_id *devices_;
+  cl_int status_;
+  std::unique_ptr<_cl_program, CLProgramDeleter> program_;
+  //  bool SetClContext();
+  //  bool SetClCommandQueue();
+  //  bool LoadKernelFromFile(const char *kernel_file);
+  //  bool BuildProgram();
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_half.cpp
+++ b/src/framework/cl/cl_half.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+#include "framework/cl/cl_half.h"
+namespace paddle_mobile {
+namespace framework {
+static const uint32_t mantissatable[2048] = {
+    0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
+    0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
+    0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000,
+    0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000,
+    0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000,
+    0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000,
+    0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000,
+    0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000,
+    0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000,
+    0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000,
+    0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000,
+    0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000,
+    0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000,
+    0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000,
+    0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000,
+    0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000,
+    0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000,
+    0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000,
+    0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000,
+    0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000,
+    0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000,
+    0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000,
+    0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000,
+    0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000,
+    0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000,
+    0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000,
+    0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000,
+    0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000,
+    0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000,
+    0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000,
+    0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000,
+    0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000,
+    0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000,
+    0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000,
+    0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000,
+    0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000,
+    0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000,
+    0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000,
+    0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000,
+    0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000,
+    0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000,
+    0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000,
+    0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000,
+    0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000,
+    0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000,
+    0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000,
+    0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000,
+    0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000,
+    0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000,
+    0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000,
+    0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000,
+    0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000,
+    0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000,
+    0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000,
+    0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000,
+    0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000,
+    0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000,
+    0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000,
+    0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000,
+    0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000,
+    0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000,
+    0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000,
+    0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000,
+    0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000,
+    0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000,
+    0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000,
+    0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000,
+    0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000,
+    0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000,
+    0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000,
+    0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000,
+    0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000,
+    0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000,
+    0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000,
+    0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000,
+    0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000,
+    0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000,
+    0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000,
+    0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000,
+    0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000,
+    0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000,
+    0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000,
+    0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000,
+    0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000,
+    0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000,
+    0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000,
+    0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000,
+    0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000,
+    0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000,
+    0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000,
+    0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000,
+    0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000,
+    0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000,
+    0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000,
+    0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000,
+    0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000,
+    0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000,
+    0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000,
+    0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000,
+    0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000,
+    0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000,
+    0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000,
+    0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000,
+    0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000,
+    0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000,
+    0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000,
+    0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000,
+    0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000,
+    0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000,
+    0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000,
+    0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000,
+    0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000,
+    0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000,
+    0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000,
+    0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000,
+    0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000,
+    0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000,
+    0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000,
+    0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000,
+    0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000,
+    0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000,
+    0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000,
+    0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000,
+    0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000,
+    0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000,
+    0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000,
+    0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000,
+    0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000,
+    0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000,
+    0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000,
+    0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000,
+    0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000,
+    0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000,
+    0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000,
+    0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000,
+    0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000,
+    0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000,
+    0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000,
+    0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000,
+    0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000,
+    0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000,
+    0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000,
+    0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000,
+    0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000,
+    0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000,
+    0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000,
+    0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000,
+    0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000,
+    0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000,
+    0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000,
+    0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000,
+    0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000,
+    0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000,
+    0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000,
+    0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000,
+    0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000,
+    0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000,
+    0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000,
+    0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000,
+    0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000,
+    0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000,
+    0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000,
+    0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000,
+    0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000,
+    0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000,
+    0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000,
+    0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000,
+    0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000,
+    0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000,
+    0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000,
+    0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000,
+    0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000,
+    0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000,
+    0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000,
+    0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000,
+    0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000,
+    0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000,
+    0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000,
+    0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000,
+    0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000,
+    0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000,
+    0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000,
+    0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000,
+    0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000,
+    0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000,
+    0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000,
+    0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000,
+    0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000,
+    0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000,
+    0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000,
+    0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000,
+    0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000,
+    0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000,
+    0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000,
+    0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000,
+    0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000,
+    0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000,
+    0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000,
+    0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000,
+    0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000,
+    0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000,
+    0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000,
+    0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000,
+    0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000,
+    0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000,
+    0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000,
+    0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000,
+    0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000,
+    0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000,
+    0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000,
+    0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000,
+    0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000,
+    0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000,
+    0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000,
+    0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000,
+    0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000,
+    0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000,
+    0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000,
+    0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000,
+    0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000,
+    0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000,
+    0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000,
+    0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000,
+    0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000,
+    0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000,
+    0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000,
+    0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000,
+    0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000,
+    0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000,
+    0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000,
+    0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000,
+    0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000,
+    0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000,
+    0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000,
+    0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000,
+    0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000,
+    0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000,
+    0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000,
+    0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000,
+    0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000,
+    0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000,
+    0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000,
+    0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000,
+    0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000,
+    0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000,
+    0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000,
+    0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000,
+    0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000,
+    0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000,
+    0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000,
+    0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000,
+    0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000,
+    0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000,
+    0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000,
+    0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000,
+    0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000,
+    0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000,
+    0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000,
+    0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000,
+    0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000,
+    0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000,
+    0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000,
+    0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000,
+    0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000,
+    0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000,
+    0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000,
+    0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000,
+    0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000,
+    0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000,
+    0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000,
+    0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000,
+    0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000,
+    0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000,
+    0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000,
+    0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000,
+    0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000,
+    0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000,
+    0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000,
+    0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000,
+    0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000,
+    0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000,
+    0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000,
+    0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000,
+    0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000,
+    0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000,
+    0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000,
+    0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000,
+    0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000,
+    0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000,
+    0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000,
+    0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000,
+    0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000,
+    0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000,
+    0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000,
+    0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000,
+    0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000,
+    0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000,
+    0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000,
+    0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000,
+    0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000,
+    0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000,
+    0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000,
+    0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000,
+    0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000,
+    0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000,
+    0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000,
+    0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000,
+    0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000,
+    0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000,
+    0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000,
+    0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000,
+    0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000,
+    0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000,
+    0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000,
+    0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000,
+    0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000,
+    0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000,
+    0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000,
+    0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000,
+    0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000,
+    0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000,
+    0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000,
+    0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000,
+    0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000,
+    0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000,
+    0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000,
+    0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000,
+    0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000,
+    0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000,
+    0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000,
+    0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000,
+    0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000,
+    0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000,
+    0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000,
+    0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000,
+    0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000,
+    0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000,
+    0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000,
+    0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000,
+    0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000,
+    0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000,
+    0x387fc000, 0x387fe000};
+// Half -> float lookup table, indexed by the top six bits of a half value
+// (`h >> 10`, see Half2Float). Each entry is the offset added to the low ten
+// bits of the half to form the index into `mantissatable`.
+// Entries 0 and 32 are 0x0000; every other entry is 0x0400 (1024).
+// NOTE(review): presumably indices 0 and 32 are the zero-exponent (zero /
+// subnormal) groups for positive and negative sign — confirm against the
+// table generator.
+static const uint16_t offsettable[64] = {
+    0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
+// Half -> float lookup table, indexed by the top six bits of a half value
+// (`h >> 10`, see Half2Float). The selected entry is added to the chosen
+// `mantissatable` entry to produce the 32-bit float bit pattern.
+// Entries 32..63 repeat entries 0..31 with bit 31 (0x80000000) set.
+static const uint32_t exponenttable[64] = {
+    0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
+    0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000,
+    0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000,
+    0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000,
+    0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000,
+    0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000,
+    0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
+    0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
+    0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000,
+    0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000,
+    0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000};
+// Float -> half lookup table, indexed by the top nine bits of the float bit
+// pattern (`(v >> 23) & 0x1ff`, see Float2Half). The entry is the base half
+// value to which the right-shifted float mantissa is added.
+// Entries 256..511 repeat entries 0..255 with bit 15 (0x8000) set.
+static const uint16_t basetable[512] = {
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010,
+    0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000,
+    0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400,
+    0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800,
+    0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001,
+    0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200,
+    0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400,
+    0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800,
+    0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00,
+    0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00};
+// Float -> half lookup table, indexed by the top nine bits of the float bit
+// pattern (`(v >> 23) & 0x1ff`, see Float2Half). The entry is the number of
+// bits by which the 23-bit float mantissa is shifted right before it is added
+// to the matching `basetable` entry.
+// Values are 0x18 (24: the whole 23-bit mantissa is shifted out), the ramp
+// 0x17..0x0e, or 0x0d (13). Entries 256..511 repeat entries 0..255.
+static const uint8_t shifttable[512] = {
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13,
+    0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17,
+    0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d,
+    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+    0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
+// Converts a 32-bit float to its 16-bit half representation by table lookup.
+// The top nine bits of the float pattern select a base value and a shift
+// count; the 23-bit mantissa is shifted right by that count (dropped bits are
+// truncated, not rounded) and added to the base.
+// NOTE(review): the float is read through a uint32_t pointer; a memcpy-based
+// bit copy would avoid relying on this type-punning.
+half_t Float2Half(float f) {
+  const uint32_t bits = *reinterpret_cast<uint32_t *>(&f);
+  const uint32_t table_index = (bits >> 23) & 0x1ff;
+  const uint32_t mantissa = bits & 0x007fffff;
+  return basetable[table_index] + (mantissa >> shifttable[table_index]);
+}
+// Converts a 16-bit half value to a 32-bit float by table lookup.
+// The top six bits of the half select an exponent word and a mantissa-table
+// offset; the low ten bits select the mantissa entry within that offset.
+// NOTE(review): the result is read back through a float pointer; a
+// memcpy-based bit copy would avoid relying on this type-punning.
+float Half2Float(half_t h) {
+  const uint32_t table_index = h >> 10;
+  const uint32_t low_bits = h & 0x3ff;
+  uint32_t bits = mantissatable[offsettable[table_index] + low_bits] +
+                  exponenttable[table_index];
+  return *reinterpret_cast<float *>(&bits);
+}
+// Converts `count` floats from `f_array` into halves written to `h_array`,
+// element by element via Float2Half. Does nothing when `count` <= 0.
+void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
+  const float *src = f_array;
+  half_t *dst = h_array;
+  for (int remaining = count; remaining > 0; --remaining) {
+    *dst++ = Float2Half(*src++);
+  }
+}
+// Converts `count` halves from `h_array` into floats written to `f_array`,
+// element by element via Half2Float. Does nothing when `count` <= 0.
+void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
+  const half_t *src = h_array;
+  float *dst = f_array;
+  for (int remaining = count; remaining > 0; --remaining) {
+    *dst++ = Half2Float(*src++);
+  }
+}
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_half.h
+++ b/src/framework/cl/cl_half.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cstdint>
+namespace paddle_mobile {
+namespace framework {
+// Raw 16-bit storage for a half-precision value; no arithmetic is defined on
+// it, it only carries the bit pattern.
+typedef uint16_t half_t;
+// Returns the half bit pattern for the float `f`.
+half_t Float2Half(float f);
+// Returns the float value for the half bit pattern `h`.
+float Half2Float(half_t h);
+// Converts the first `count` elements of `f_array` and stores the results in
+// `h_array`; both arrays must hold at least `count` elements.
+void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
+// Converts the first `count` elements of `h_array` and stores the results in
+// `f_array`; both arrays must hold at least `count` elements.
+void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_helper.h
+++ b/src/framework/cl/cl_helper.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include <type_traits>
+#include <vector>
+#include "common/log.h"
+#include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_image.h"
+#include "framework/cl/cl_scope.h"
+namespace paddle_mobile {
+namespace framework {
+/**
+ * Per-operator helper around a CLScope: collects the kernels an operator
+ * uses and exposes the scope's command queue / context plus a default global
+ * work size derived from a CLImage.
+ *
+ * The helper does not own the scope. A default-constructed helper has a null
+ * scope and must not be used for AddKernel / CLCommandQueue / CLContext.
+ */
+class CLHelper {
+ public:
+  CLHelper() = default;
+  explicit CLHelper(CLScope *scope) : scope_(scope) {}
+  /**
+   * Fetches `kernel_name` from `file_name` through the scope and appends it
+   * to the kernel list; its position is the index later passed to KernelAt.
+   */
+  void AddKernel(const std::string &kernel_name, const std::string &file_name) {
+    DLOG << " begin add kernel ";
+    auto kernel = scope_->GetKernel(kernel_name, file_name);
+    DLOG << " add kernel ing ";
+    kernels.emplace_back(std::move(kernel));
+  }
+  /**
+   * Returns the kernel added by the `index`-th AddKernel call (0-based).
+   * The helper keeps ownership of the returned handle.
+   */
+  cl_kernel KernelAt(const int index) {
+    DLOG << " kernel count: " << kernels.size();
+    PADDLE_MOBILE_ENFORCE(
+        index >= 0 && static_cast<size_t>(index) < kernels.size(),
+        " kernel index out of range");
+    return kernels[index].get();
+  }
+  cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); }
+  cl_context CLContext() { return scope_->Context(); }
+  /**
+   * Computes a three-component global work size for `image`.
+   *
+   * - 4-D tensor dims (n, c, h, w): {image_width / w, w, n * h}
+   * - 2-D tensor dims:              {1, image_width, image_height}
+   * - 1-D tensor dims:              {1, image_width, 1}
+   *
+   * Any other rank raises through PADDLE_MOBILE_THROW_EXCEPTION.
+   */
+  std::vector<size_t> DefaultWorkSize(const CLImage &image) {
+    // n c h w
+    const auto &image_dim = image.dims();
+    if (image_dim.size() == 4) {
+      // Convert explicitly: the dims are signed while the work size is
+      // size_t, and brace-initialisation must not narrow implicitly.
+      const size_t n = static_cast<size_t>(image_dim[0]);
+      const size_t h = static_cast<size_t>(image_dim[2]);
+      const size_t w = static_cast<size_t>(image_dim[3]);
+      const size_t image_width = image.ImageWidth();
+      return {image_width / w, w, n * h};
+    } else if (image_dim.size() == 2) {
+      return {1, image.ImageWidth(), image.ImageHeight()};
+    } else if (image_dim.size() == 1) {
+      return {1, image.ImageWidth(), 1};
+    }
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
+    // Only reached if the macro above does not throw (e.g. exceptions
+    // compiled out — TODO confirm); never fall off a non-void function.
+    return {};
+  }
+ private:
+  // Not owned. Null until a scope is supplied through the constructor.
+  CLScope *scope_ = nullptr;
+  std::vector<std::unique_ptr<_cl_kernel, CLKernelDeleter>> kernels;
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_image.cpp
+++ b/src/framework/cl/cl_image.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "framework/cl/cl_image.h"
+namespace paddle_mobile {
+namespace framework {
+// Placeholder: the body is empty, so calling this currently does nothing and
+// leaves `tensor` untouched.
+// NOTE(review): judging by the name it is meant to read `cl_image` back into
+// `tensor` via `commandQueue` — confirm once implemented.
+void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
+                     cl_command_queue commandQueue) {
+  // TODO(yangfei): need imp
+}
+// Placeholder: the body is empty, so calling this currently does nothing and
+// leaves `cl_image` untouched.
+// NOTE(review): judging by the name it is meant to upload `tensor` into
+// `cl_image` via `commandQueue` — confirm once implemented.
+void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
+                     cl_command_queue commandQueue) {
+  // TODO(yangfei): need imp
+}
+#ifdef PADDLE_MOBILE_DEBUG
+// Debug printer for a CLImage: performs a blocking read of the whole image,
+// converts it back to tensor layout with the image's converter, and prints
+// the tensor dims followed by a sample of about 20 evenly spaced elements.
+// The image must have been initialised (it needs a converter and a command
+// queue). Buffers are std::vector so they are released even if the error
+// check or the converter throws.
+Print &operator<<(Print &printer, const CLImage &cl_image) {
+  const size_t width = cl_image.ImageWidth();
+  const size_t height = cl_image.ImageHeight();
+  // Four half values per texel.
+  std::vector<half_t> image_data(width * height * 4);
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {width, height, 1};
+  cl_int err = clEnqueueReadImage(cl_image.CommandQueue(),
+                                  cl_image.GetCLImage(), CL_TRUE, origin,
+                                  region, 0, 0, image_data.data(), 0, NULL,
+                                  NULL);
+  CL_CHECK_ERRORS(err);
+  const int64_t numel = cl_image.numel();
+  std::vector<float> tensor_data(static_cast<size_t>(numel));
+  auto converter = cl_image.Converter();
+  converter->ImageToNCHW(image_data.data(), tensor_data.data(),
+                         cl_image.ImageDims(), cl_image.dims());
+  // Print roughly 20 samples; a stride of at least 1 keeps the loop finite
+  // for tensors with fewer than 20 elements.
+  int64_t stride = numel / 20;
+  stride = stride > 0 ? stride : 1;
+  printer << " dims: " << cl_image.dims() << "\n";
+  for (int64_t i = 0; i < numel; i += stride) {
+    printer << tensor_data[i] << " ";
+  }
+  return printer;
+}
+#endif
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cstring>
+#include <memory>
+#include <vector>
+#include "CL/cl.h"
+#include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_engine.h"
+#include "framework/cl/cl_half.h"
+#include "framework/cl/cl_image_converter.h"
+#include "framework/cl/cl_tool.h"
+#include "framework/ddim.h"
+#include "framework/tensor.h"
+namespace paddle_mobile {
+namespace framework {
+class CLImage {
+ public:
+  CLImage() = default;
+  ~CLImage() {
+    if (tensor_data_ != nullptr) {
+      delete[](tensor_data_);
+    }
+    if (image_converter_) {
+      delete (image_converter_);
+    }
+  }
+  /*
+   * will not hold input tensor data, memcpy in this method
+   * */
+  void SetTensorData(float *tensorData, const DDim &dim) {
+    int numel = product(dim);
+    if (tensor_data_ != nullptr) {
+      delete[](tensor_data_);
+      tensor_data_ = nullptr;
+    }
+    tensor_data_ = new float[numel];
+    memcpy(tensor_data_, tensorData, numel * sizeof(float));
+    tensor_dims_ = dim;
+  }
+  /*
+   * need call SetTensorData first
+   *
+   * folder when one dim or two dim
+   * */
+  void InitCLImage(cl_context context, cl_command_queue command_queue) {
+    PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
+                          " need call SetTensorData first");
+    CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
+    InitCLImage(context, command_queue, folder_converter);
+  }
+  void InitCLImage(cl_context context, cl_command_queue command_queue,
+                   CLImageConverterBase *converter) {
+    if (image_converter_ != nullptr) {
+      delete (image_converter_);
+    }
+    PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
+                          " need call SetTensorData first");
+    DLOG << " begin init cl image ";
+    image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
+    half_t *image_data = new half_t[product(image_dims_) * 4];
+    DLOG << " convert to image";
+    converter->NCHWToImage(tensor_data_, image_data, tensor_dims_);
+    DLOG << " end convert to image";
+    InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
+    delete[](image_data);
+    delete[](tensor_data_);
+    command_queue_ = command_queue;
+    tensor_data_ = nullptr;
+    image_converter_ = converter;
+    initialized_ = true;
+    DLOG << " end init cl image";
+  }
+  void InitNImage(cl_context context, cl_command_queue command_queue) {
+    if (tensor_data_ == nullptr) {
+      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
+    }
+    CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock();
+    InitCLImage(context, command_queue, folder_converter);
+    PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
+  }
+  void InitDWImage(cl_context context, cl_command_queue command_queue) {
+    if (tensor_data_ == nullptr) {
+      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
+    }
+    CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock();
+    InitCLImage(context, command_queue, dw_converter);
+    PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
+  }
+  void InitEmptyImage(cl_context context, cl_command_queue command_queue,
+                      const DDim &dim) {
+    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
+                          " empty image tensor data shouldn't have value");
+    CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
+    DLOG << " to get image dims ";
+    image_dims_ = folder_converter->InitImageDimInfoWith(dim);
+    DLOG << " end get image dims " << image_dims_;
+    InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
+    tensor_dims_ = dim;
+    command_queue_ = command_queue;
+    image_converter_ = folder_converter;
+    cl_event_ = CLEngine::Instance()->CreateEvent(context);
+    initialized_ = true;
+    DLOG << " end init cl image";
+  }
+  cl_mem GetCLImage() const { return cl_image_.get(); }
+  const DDim &ImageDims() const { return image_dims_; }
+  inline size_t ImageWidth() const { return image_dims_[0]; }
+  inline size_t ImageHeight() const { return image_dims_[1]; }
+  inline cl_command_queue CommandQueue() const { return command_queue_; }
+  /*
+   *  resize original tensor dim
+   * */
+  inline CLImage &Resize(const DDim &dims) {
+    tensor_dims_ = dims;
+    return *this;
+  }
+  template <typename T>
+  T *data() const {
+    if (initialized_) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          " cl image has initialized, tensor data has been deleted, can't use "
+          "tensor data");
+    }
+    return reinterpret_cast<T *>(tensor_data_);
+  }
+  /*
+   *  Number of elements of the logical tensor (product of tensor_dims_).
+   * */
+  inline int64_t numel() const { return product(tensor_dims_); }
+  /*
+   *  Logical (original) tensor shape, as opposed to the image shape.
+   * */
+  const DDim &dims() const { return tensor_dims_; }
+  // Raw event handle created for this image; ownership stays with cl_event_.
+  cl_event GetClEvent() const { return cl_event_.get(); }
+  // Converter that defines the tensor <-> image layout (non-owning view for
+  // the caller; may be nullptr before initialization).
+  CLImageConverterBase *Converter() const { return image_converter_; }
+ private:
+  /**
+   * Create the 2-D RGBA / half-float OpenCL image and store it in cl_image_.
+   *
+   * @param context  OpenCL context to create the image in.
+   * @param width    image width in texels.
+   * @param height   image height in texels.
+   * @param data     optional host data to copy into the image; when nullptr
+   *                 the image is only allocated.
+   *
+   * Throws when clCreateImage reports an error.
+   */
+  void InitCLImage(cl_context context, int width, int height, void *data) {
+    // Value-initialize and assign field by field: C99-style designated
+    // initializers are not standard C++ before C++20 and only compile as a
+    // compiler extension.
+    cl_image_format cf = {};
+    cf.image_channel_order = CL_RGBA;
+    cf.image_channel_data_type = CL_HALF_FLOAT;
+    cl_image_desc cid = {};
+    cid.image_type = CL_MEM_OBJECT_IMAGE2D;
+    cid.image_width = static_cast<size_t>(width);
+    cid.image_height = static_cast<size_t>(height);
+    cid.image_depth = 1;
+    cid.image_array_size = 1;
+    cid.image_row_pitch = 0;
+    cid.image_slice_pitch = 0;
+    cid.num_mip_levels = 0;
+    cid.num_samples = 0;
+    cid.buffer = nullptr;
+    cl_int err;
+    cl_mem cl_image = clCreateImage(
+        context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
+        &cf,   // const cl_image_format *image_format
+        &cid,  // const cl_image_desc *image_desc
+        data,  // void *host_ptr
+        &err);
+    // Take ownership first so any previously held image is released even on
+    // the failure path (same behaviour as before).
+    cl_image_.reset(cl_image);
+    if (err != CL_SUCCESS) {
+      CL_CHECK_ERRORS(err);
+      PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
+    }
+  }
+  // True once the device image exists; data() refuses to run afterwards.
+  bool initialized_ = false;
+  // Owning handles; released through the custom deleters.
+  std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
+  std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
+  // Logical tensor shape.
+  DDim tensor_dims_;
+  // Shape of the 2-D image as {width, height}.
+  DDim image_dims_;
+  // Pending host data used before the image is initialized.
+  float *tensor_data_ = nullptr;
+  // NOTE(review): context_ is not assigned anywhere in the visible part of
+  // this class - confirm it is set elsewhere or remove it.
+  cl_context context_;
+  cl_command_queue command_queue_;
+  // Layout converter selected by the Init* method that built the image.
+  CLImageConverterBase *image_converter_ = nullptr;
+};
+// Copy a host Tensor into a CLImage using the given command queue.
+// NOTE(review): defined in another translation unit; presumably converts via
+// the image's converter - confirm against the implementation.
+void TensorToCLImage(Tensor *tensor, CLImage *image,
+                     cl_command_queue commandQueue);
+// Read a CLImage back into a host Tensor using the given command queue.
+// NOTE(review): exact blocking / layout semantics are not visible here.
+void CLImageToTensor(CLImage *image, Tensor *tensor,
+                     cl_command_queue commandQueue);
+#ifdef PADDLE_MOBILE_DEBUG
+// Debug-only pretty printer for a CLImage.
+Print &operator<<(Print &printer, const CLImage &image);
+#endif
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_image_converter.cpp
+++ b/src/framework/cl/cl_image_converter.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "framework/cl/cl_image_converter.h"
+namespace paddle_mobile {
+namespace framework {
+// Compute the {width, height} of the image that holds a tensor of up to four
+// dimensions. Missing leading dimensions are treated as 1 (shape is right
+// aligned into N, C, H, W). Channels are packed four per texel, so
+//   width  = W * ceil(C / 4)
+//   height = H * N
+//
+// The returned reference points at function-local static storage: it stays
+// valid until the next call of this function, so callers must copy it (all
+// visible callers do). Returning the make_ddim() temporary directly would
+// hand out a dangling reference. The static storage is not safe for
+// concurrent calls from several threads.
+const DDim &CLImageConverterDefault::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  // More than four dimensions would index new_dims[] out of bounds below.
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4, "tensor dim is not support ");
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < tensor_dim.size(); ++j) {
+    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+  }
+  size_t N, C, H, W;
+  N = new_dims[0];
+  C = new_dims[1];
+  H = new_dims[2];
+  W = new_dims[3];
+  size_t width = W * ((C + 3) / 4);
+  size_t height = H * N;
+  static DDim image_dim;
+  // Explicit casts: size_t -> int64_t inside a braced list is a narrowing
+  // conversion on 64-bit targets.
+  image_dim = make_ddim(
+      {static_cast<int64_t>(width), static_cast<int64_t>(height)});
+  return image_dim;
+}
+// Pack a dense NCHW float tensor into the half-float RGBA image layout.
+//
+// For element (n, c, h, w) the destination lane is
+//   ((n * H + h) * width + (c / 4) * W + w) * 4 + (c % 4)
+// where width is the image width in texels. Channels beyond C inside the last
+// group of four are written as zero so every texel is fully defined.
+//
+// nchw       source tensor, read sequentially in NCHW order.
+// image      destination buffer; must hold width * height * 4 half values.
+// tensor_dim logical shape (1 to 4 dimensions, right aligned into N,C,H,W).
+void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
+                                          const DDim &tensor_dim) {
+  // More than four dimensions would index new_dims[] out of bounds below.
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4, "tensor dim is not support ");
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < tensor_dim.size(); ++j) {
+    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+  }
+  size_t N, C, H, W;
+  N = new_dims[0];
+  C = new_dims[1];
+  H = new_dims[2];
+  W = new_dims[3];
+  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
+  DLOG << " tensor dim " << tensor_dim;
+  DLOG << " image dim " << in_image_dim;
+  size_t width = in_image_dim[0];
+  // Number of 4-channel groups laid out side by side along the image width.
+  int w_block = width / W;
+  float *p = nchw;
+  // i0: texel offset of the first row of batch n.
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < w_block * 4; c++) {
+      // i1: texel offset of row h inside the channel group of c.
+      size_t i1 = i0 + (c / 4) * W;
+      for (int h = 0; h < H; h++) {
+        // i2: half-value offset (4 lanes per texel) of lane c % 4.
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          if (c < C) {
+            // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
+            // (c % 4);
+            image[i2] = Float2Half(*p);
+            i2 += 4;
+            p++;
+          } else {
+            // Padding channel: keep the lane deterministic.
+            image[i2] = 0.0;
+            i2 += 4;
+          }
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+}
+// Unpack the half-float RGBA image layout back into a dense NCHW float
+// tensor. Inverse of NCHWToImage: element (n, c, h, w) is read from lane
+//   ((n * H + h) * width + (c / 4) * W + w) * 4 + (c % 4)
+//
+// image      source buffer in image layout.
+// tensor     destination, written sequentially in NCHW order.
+// image_dim  {width, height} of the image; only the width is needed.
+// tensor_dim logical shape (1 to 4 dimensions, right aligned into N,C,H,W).
+void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
+                                          const DDim &image_dim,
+                                          const DDim &tensor_dim) {
+  // More than four dimensions would index new_dims[] out of bounds below.
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4, "tensor dim is not support ");
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < tensor_dim.size(); ++j) {
+    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+  }
+  size_t N, C, H, W;
+  N = new_dims[0];
+  C = new_dims[1];
+  H = new_dims[2];
+  W = new_dims[3];
+  // The former `height` local read image_dim[0] (the width) by mistake and
+  // was never used, so it has been dropped.
+  int width = image_dim[0];
+  float *p = tensor;
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < C; c++) {
+      size_t i1 = i0 + (c / 4) * W;
+      for (int h = 0; h < H; h++) {
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          *p = Half2Float(image[i2]);
+          i2 += 4;
+          p++;
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+}
+// Compute the image {width, height} for a tensor of 1 to 4 dimensions and
+// remember the block geometry in this converter.
+//
+//  * rank 1 or 2: the tensor is treated as a (rows, cols) matrix, four
+//    columns per texel: width = ceil(cols / 4), height = rows, c_block_ = 1.
+//  * rank 3 or 4: same layout as the default converter:
+//    width = W * ceil(C / 4), height = H * N, c_block_ = ceil(C / 4).
+//
+// The returned reference points at function-local static storage: it stays
+// valid until the next call of this function, so callers must copy it.
+// Returning the make_ddim() temporary directly would hand out a dangling
+// reference. The static storage is not safe for concurrent calls.
+const DDim &CLImageConverterFolder::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  // Rank 0 would read tensor_dim[0]; rank > 4 would overflow new_dims[].
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
+                        "tensor dim is not support ");
+  static DDim image_dim;
+  if (tensor_dim.size() <= 2) {
+    int tdim[2] = {1, 1};
+    if (tensor_dim.size() == 1) {
+      tdim[1] = tensor_dim[0];
+    } else {
+      tdim[0] = tensor_dim[0];
+      tdim[1] = tensor_dim[1];
+    }
+    int width = (tdim[1] + 3) / 4;
+    int height = tdim[0];
+    width_of_one_block_ = width;
+    height_of_one_block_ = height;
+    c_block_ = 1;
+    image_dim = make_ddim({width, height});
+  } else {
+    size_t new_dims[] = {1, 1, 1, 1};
+    for (int j = 0; j < tensor_dim.size(); ++j) {
+      new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+    }
+    size_t N, C, H, W;
+    N = new_dims[0];
+    C = new_dims[1];
+    H = new_dims[2];
+    W = new_dims[3];
+    size_t width = W * ((C + 3) / 4);
+    size_t height = H * N;
+    width_of_one_block_ = W;
+    height_of_one_block_ = H;
+    c_block_ = width / W;
+    // Explicit casts: size_t -> int64_t inside a braced list is a narrowing
+    // conversion on 64-bit targets.
+    image_dim = make_ddim(
+        {static_cast<int64_t>(width), static_cast<int64_t>(height)});
+  }
+  return image_dim;
+}
+// Pack a float tensor into the folder image layout.
+//
+// Rank 3/4 tensors are delegated to CLImageConverterDefault. Rank 1/2 tensors
+// are treated as a (rows, cols) matrix stored row by row with four columns
+// per texel; element (h, w) goes to lane (h * width + w / 4) * 4 + (w % 4).
+//
+// tensor     source data, row-major.
+// image      destination; must hold width * height * 4 half values.
+// tensor_dim logical shape, 1 to 4 dimensions.
+void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
+                                         const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
+                        "tensor dim is not support ");
+  if (tensor_dim.size() > 2) {
+    CLImageConverterDefault default_converter;
+    default_converter.NCHWToImage(tensor, image, tensor_dim);
+  } else {
+    int tdim[2] = {1, 1};
+    if (tensor_dim.size() == 1) {
+      tdim[1] = tensor_dim[0];
+    } else {
+      tdim[0] = tensor_dim[0];
+      tdim[1] = tensor_dim[1];
+    }
+    DDim image_dim = InitImageDimInfoWith(tensor_dim);
+    int width = image_dim[0];
+    // Each image row holds width texels, i.e. width * 4 lanes.
+    const int lanes_per_row = width * 4;
+    for (int h = 0; h < tdim[0]; h++) {
+      for (int w = 0; w < tdim[1]; w++) {
+        image[(h * width + w / 4) * 4 + (w % 4)] =
+            Float2Half(tensor[h * tdim[1] + w]);
+      }
+      // Zero the padding lanes of the last texel so the image content is
+      // deterministic, matching what the default converter does for padding
+      // channels.
+      for (int w = tdim[1]; w < lanes_per_row; w++) {
+        image[(h * width + w / 4) * 4 + (w % 4)] = 0.0;
+      }
+    }
+  }
+}
+// Unpack the folder image layout back into a dense float tensor.
+//
+// Rank 3/4 tensors are delegated to CLImageConverterDefault. Rank 1/2 tensors
+// are read as a (rows, cols) matrix from lane
+//   (h * width + w / 4) * 4 + (w % 4).
+//
+// image      source buffer in image layout.
+// tensor     destination, row-major.
+// image_dim  {width, height} of the image; only the width is needed here.
+// tensor_dim logical shape, 1 to 4 dimensions.
+void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
+                                         const DDim &image_dim,
+                                         const DDim &tensor_dim) {
+  // Same rank contract as NCHWToImage.
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
+                        "tensor dim is not support ");
+  if (tensor_dim.size() > 2) {
+    CLImageConverterDefault default_converter;
+    default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
+  } else {
+    int width = image_dim[0];
+    // Default to an empty matrix so the loops below never run on
+    // uninitialized bounds, even if the check above is compiled out.
+    int H = 0;
+    int W = 0;
+    if (tensor_dim.size() == 2) {
+      H = tensor_dim[0];
+      W = tensor_dim[1];
+    } else if (tensor_dim.size() == 1) {
+      H = 1;
+      W = tensor_dim[0];
+    }
+    float *p = tensor;
+    for (int h = 0; h < H; h++) {
+      for (int w = 0; w < W; w++) {
+        p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
+      }
+    }
+  }
+}
+// Compute the image {width, height} for the NW-block layout of a 4-D tensor:
+// the batch dimension N is packed four per texel along the width, so
+//   width  = W * ceil(N / 4)
+//   height = C * H
+//
+// The returned reference points at function-local static storage: it stays
+// valid until the next call of this function, so callers must copy it.
+// Returning the make_ddim() temporary directly would hand out a dangling
+// reference. The static storage is not safe for concurrent calls.
+const DDim &CLImageConverterNWBlock::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  size_t N, C, H, W;
+  N = tensor_dim[0];
+  C = tensor_dim[1];
+  H = tensor_dim[2];
+  W = tensor_dim[3];
+  size_t width = W * ((N + 3) / 4);
+  size_t height = C * H;
+  static DDim image_dim;
+  // Explicit casts: size_t -> int64_t inside a braced list is a narrowing
+  // conversion on 64-bit targets.
+  image_dim = make_ddim(
+      {static_cast<int64_t>(width), static_cast<int64_t>(height)});
+  return image_dim;
+}
+// Pack a 4-D NCHW float tensor into the NW-block image layout, where the
+// batch index selects the lane (n % 4) and the texel column block (n / 4).
+// Batches beyond N inside the last group of four are written as zero.
+void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
+                                          const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  auto image_dim = InitImageDimInfoWith(tensor_dim);
+  const int batch = tensor_dim[0];
+  const int channels = tensor_dim[1];
+  const int rows = tensor_dim[2];
+  const int cols = tensor_dim[3];
+  const int width = image_dim[0];
+  const int height = image_dim[1];
+  // Number of 4-batch groups placed side by side along the image width.
+  const int block = image_dim[0] / tensor_dim[3];
+  const int padded_batch = block * 4;
+  const int lane_count = width * height * 4;
+  float *src = tensor;
+  for (int n = 0; n < padded_batch; n++) {
+    // Part of the lane index that depends only on the batch index.
+    const int batch_offset = 4 * cols * (n / 4) + n % 4;
+    const bool real_batch = n < batch;
+    for (int c = 0; c < channels; c++) {
+      for (int h = 0; h < rows; ++h) {
+        const int row_offset = 4 * c * (width * rows) + 4 * h * width;
+        for (int w = 0; w < cols; ++w) {
+          const int index = row_offset + batch_offset + w * 4;
+          if (real_batch) {
+            image[index] = Float2Half(*src);
+            src++;
+          } else {
+            image[index] = 0.0;
+          }
+          if (index >= lane_count) {
+            DLOG << " index out of range ";
+          }
+        }
+      }
+    }
+  }
+  DLOG << " init done";
+}
+// Unpack the NW-block image layout back into a dense 4-D NCHW float tensor.
+// Element (n, c, h, w) is read from lane
+//   4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + 4 * w + n % 4.
+void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
+                                          const DDim &image_dim,
+                                          const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  const int batch = tensor_dim[0];
+  const int channels = tensor_dim[1];
+  const int rows = tensor_dim[2];
+  const int cols = tensor_dim[3];
+  const int width = image_dim[0];
+  const int height = image_dim[1];
+  // Kept from the original implementation although the loops do not need it.
+  const int block = image_dim[0] / tensor_dim[3];
+  (void)block;
+  const int lane_count = width * height * 4;
+  float *dst = tensor;
+  for (int n = 0; n < batch; n++) {
+    // Part of the lane index that depends only on the batch index.
+    const int batch_offset = 4 * cols * (n / 4) + n % 4;
+    for (int c = 0; c < channels; c++) {
+      for (int h = 0; h < rows; ++h) {
+        const int row_offset = 4 * c * (width * rows) + 4 * h * width;
+        for (int w = 0; w < cols; ++w) {
+          const int index = row_offset + batch_offset + w * 4;
+          *dst = Half2Float(image[index]);
+          ++dst;
+          if (index >= lane_count) {
+            DLOG << " index out of range ";
+          }
+        }
+      }
+    }
+  }
+  DLOG << " init done";
+}
+// Compute the image {width, height} for the DW-block layout of a 4-D tensor:
+// dimension 0 is packed four per texel along the width, so
+//   width  = W * ceil(dim0 / 4)
+//   height = dim1 * H
+//
+// The returned reference points at function-local static storage: it stays
+// valid until the next call of this function, so callers must copy it.
+// Returning the make_ddim() temporary directly would hand out a dangling
+// reference. The static storage is not safe for concurrent calls.
+const DDim &CLImageConverterDWBlock::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  size_t N, C, H, W;
+  N = tensor_dim[0];
+  C = tensor_dim[1];
+  H = tensor_dim[2];
+  W = tensor_dim[3];
+  size_t width = W * ((N + 3) / 4);
+  size_t height = C * H;
+  static DDim image_dim;
+  // Explicit casts: size_t -> int64_t inside a braced list is a narrowing
+  // conversion on 64-bit targets.
+  image_dim = make_ddim(
+      {static_cast<int64_t>(width), static_cast<int64_t>(height)});
+  return image_dim;
+}
+// Pack a 4-D float tensor into the DW-block image layout. This is the default
+// converter's packing with the roles of dimension 0 and dimension 1 swapped:
+// dimension 0 is grouped four per texel, dimension 1 selects the row block.
+//
+// NOTE(review): the source is read sequentially while iterating dim1 in the
+// outer loop and dim0 inside it, which matches the tensor's memory order only
+// when dim1 == 1 (presumably always true for depth-wise filters) - confirm.
+void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
+                                          const DDim &tensor_dim) {
+  // Check the rank before new_dims[] is filled: any other rank is rejected by
+  // InitImageDimInfoWith anyway, and more than four dimensions would write
+  // outside the array first.
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < tensor_dim.size(); ++j) {
+    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
+  }
+  size_t N, C, H, W;
+  // Deliberately swapped with respect to the default converter.
+  N = new_dims[1];
+  C = new_dims[0];
+  H = new_dims[2];
+  W = new_dims[3];
+  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
+  DLOG << " tensor dim " << tensor_dim;
+  DLOG << " image dim " << in_image_dim;
+  size_t width = in_image_dim[0];
+  // Number of 4-element groups laid out side by side along the image width.
+  int w_block = width / W;
+  float *p = tensor;
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < w_block * 4; c++) {
+      size_t i1 = i0 + (c / 4) * W;
+      for (int h = 0; h < H; h++) {
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          if (c < C) {
+            // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
+            // (c % 4);
+            image[i2] = Float2Half(*p);
+            i2 += 4;
+            p++;
+          } else {
+            // Padding lane: keep it deterministic.
+            image[i2] = 0.0;
+            i2 += 4;
+          }
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+}
+// Unpack the DW-block image layout back into a dense float tensor. Mirrors
+// NCHWToImage: dimension 1 plays the role of the batch (row block) and
+// dimension 0 the role of the channel (lane / column block).
+//
+// image      source buffer in image layout.
+// tensor     destination, written sequentially.
+// image_dim  {width, height} of the image; only the width is needed.
+// tensor_dim logical 4-D shape.
+void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
+                                          const DDim &image_dim,
+                                          const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  float *p = tensor;
+  // Deliberately swapped with respect to the default converter.
+  int N = tensor_dim[1];
+  int C = tensor_dim[0];
+  int H = tensor_dim[2];
+  int W = tensor_dim[3];
+  // The former `height` local read image_dim[0] (the width) by mistake and
+  // was never used, so it has been dropped.
+  int width = image_dim[0];
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < C; c++) {
+      size_t i1 = i0 + (c / 4) * W;
+      for (int h = 0; h < H; h++) {
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          *p = Half2Float(image[i2]);
+          i2 += 4;
+          p++;
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+}
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_image_converter.h
+++ b/src/framework/cl/cl_image_converter.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "framework/cl/cl_half.h"
+#include "framework/ddim.h"
+namespace paddle_mobile {
+namespace framework {
+// Interface for strategies that map a dense float tensor to the half-float
+// RGBA image layout used on the device, and back.
+class CLImageConverterBase {
+ public:
+  // Converters are held and handed around through base-class pointers, so
+  // destruction through such a pointer must reach the derived destructor.
+  virtual ~CLImageConverterBase() = default;
+  // Pack `nchw` (shape `tensor_dim`) into `image`.
+  virtual void NCHWToImage(float *nchw, half_t *image,
+                           const DDim &tensor_dim) = 0;
+  // Unpack `image` (shape `image_dim`, {width, height}) into `nchw`.
+  virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
+                           const DDim &tensor_dim) = 0;
+  // Compute the image {width, height} needed for a tensor of `tensor_dim`.
+  virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0;
+};
+// Default layout: channels are packed four per texel along the image width,
+// batches are stacked along the image height.
+class CLImageConverterDefault : public CLImageConverterBase {
+ public:
+  // `override` makes the compiler verify these match the base interface.
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim) override;
+  void NCHWToImage(float *nchw, half_t *image,
+                   const DDim &tensor_dim) override;
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim) override;
+};
+// Folder layout: rank 1/2 tensors are stored as a matrix with four columns
+// per texel; rank 3/4 tensors use the default channel-packed layout. The
+// block geometry of the last InitImageDimInfoWith() call is remembered.
+class CLImageConverterFolder : public CLImageConverterBase {
+ public:
+  // `override` makes the compiler verify these match the base interface.
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim) override;
+  void NCHWToImage(float *tensor, half_t *image,
+                   const DDim &tensor_dim) override;
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim) override;
+  /*
+   *  width of one block, as recorded by InitImageDimInfoWith
+   *  (0 until that has been called)
+   * */
+  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
+  /*
+   *  height of one block, as recorded by InitImageDimInfoWith
+   *  (0 until that has been called)
+   * */
+  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
+  // Number of blocks along the image width (0 until InitImageDimInfoWith).
+  int GetCBlock() const { return c_block_; }
+ private:
+  // Zero-initialized so the getters never return indeterminate values when
+  // called before InitImageDimInfoWith().
+  int c_block_ = 0;
+  int width_of_one_block_ = 0;
+  int height_of_one_block_ = 0;
+};
+// NW-block layout: the batch dimension is packed four per texel along the
+// image width; channels and rows are stacked along the image height.
+class CLImageConverterNWBlock : public CLImageConverterBase {
+  // Without an access specifier these overrides were private and could only
+  // be reached through a base-class pointer, unlike the Default and Folder
+  // converters. Making them public is source compatible.
+ public:
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim) override;
+  void NCHWToImage(float *tensor, half_t *image,
+                   const DDim &tensor_dim) override;
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim) override;
+};
+// DW-block layout: like the default layout but with the first two tensor
+// dimensions swapped (dimension 0 is packed four per texel).
+class CLImageConverterDWBlock : public CLImageConverterBase {
+  // Without an access specifier these overrides were private and could only
+  // be reached through a base-class pointer, unlike the Default and Folder
+  // converters. Making them public is source compatible.
+ public:
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim) override;
+  void NCHWToImage(float *tensor, half_t *image,
+                   const DDim &tensor_dim) override;
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim) override;
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "CL/cl.h"
+#include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_engine.h"
+#include "framework/cl/cl_tool.h"
+namespace paddle_mobile {
+namespace framework {
+// Owns one OpenCL context, one command queue on that context, and a cache of
+// built programs keyed by kernel source file name.
+class CLScope {
+ public:
+  // Creates the context and the command queue through the CLEngine singleton.
+  CLScope() {
+    CLEngine *engine = CLEngine::Instance();
+    context_ = engine->CreateContext();
+    command_queue_ = engine->CreateClCommandQueue(context_.get());
+  }
+  // Non-owning handle of the command queue.
+  cl_command_queue CommandQueue() { return command_queue_.get(); }
+  // Create kernel `kernel_name` from the program built from `file_name`.
+  // The program is built on first use and cached afterwards. A creation
+  // failure is reported through CL_CHECK_ERRORS; the returned pointer then
+  // holds whatever clCreateKernel returned.
+  std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
+      const std::string &kernel_name, const std::string &file_name) {
+    DLOG << " to get program " << file_name;
+    auto program = Program(file_name);
+    DLOG << " end get program ~ ";
+    DLOG << " to create kernel: " << kernel_name;
+    std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
+        clCreateKernel(program, kernel_name.c_str(), &status_));
+    CL_CHECK_ERRORS(status_);
+    DLOG << " end create kernel ~ ";
+    // Returning the local by name lets the compiler elide or move it; an
+    // explicit std::move here would only inhibit copy elision.
+    return kernel;
+  }
+  // Non-owning handle of the context.
+  cl_context Context() { return context_.get(); }
+  // Return the built program for `file_name`, building and caching it on the
+  // first request. Sources are looked up under "./cl_kernel/".
+  cl_program Program(const std::string &file_name) {
+    auto it = programs_.find(file_name);
+    if (it != programs_.end()) {
+      return it->second.get();
+    }
+    auto program = CLEngine::Instance()->CreateProgramWith(
+        context_.get(), "./cl_kernel/" + file_name);
+    DLOG << " --- begin build program -> " << file_name << " --- ";
+    CLEngine::Instance()->BuildProgram(program.get());
+    DLOG << " --- end build program -> " << file_name << " --- ";
+    // Grab the raw handle before ownership moves into the cache so the map
+    // does not have to be searched a second time.
+    cl_program raw_program = program.get();
+    programs_[file_name] = std::move(program);
+    return raw_program;
+  }
+ private:
+  // Status of the most recent clCreateKernel call.
+  cl_int status_ = CL_SUCCESS;
+  // Declaration order matters: members are destroyed in reverse order, so
+  // the programs go first, then the queue, then the context.
+  std::unique_ptr<_cl_context, CLContextDeleter> context_;
+  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_;
+  std::unordered_map<std::string,
+                     std::unique_ptr<_cl_program, CLProgramDeleter>>
+      programs_;
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_tensor.h
+++ b/src/framework/cl/cl_tensor.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "CL/cl.h"
+#include "framework/cl/cl_deleter.h"
+#include "framework/cl/cl_engine.h"
+#include "framework/tensor_base.h"
+namespace paddle_mobile {
+namespace framework {
+// Tensor whose storage is an OpenCL buffer (cl_mem) instead of host memory.
+//
+// NOTE(review): the base class is inherited privately (class default), so
+// TensorBase members are not reachable from outside - confirm this is
+// intended. Copying a CLTensor after Data() has been called would make two
+// objects delete the same host copy - confirm instances are never copied.
+class CLTensor : TensorBase {
+ public:
+  CLTensor(cl_context context, cl_command_queue command_queue)
+      : context_(context), command_queue_(command_queue) {}
+  CLTensor() = default;
+  /*
+   * if init method haven't set context and command_queue, need set
+   * */
+  void SetContextAndCommandQueue(cl_context context,
+                                 cl_command_queue command_queue) {
+    context_ = context;
+    command_queue_ = command_queue;
+  }
+  /*! Resize the dimensions of the memory block. */
+  inline CLTensor &Resize(const DDim &dims) {
+    dims_ = dims;
+    return *this;
+  }
+  /**
+   * @brief   Allocate a new cl buffer of numel() * sizeof(T) bytes and fill
+   *          it from the host pointer `data`.
+   * @note    Any previously held buffer is replaced.
+   */
+  template <typename T>
+  inline cl_mem mutable_with_data(const T *data) {
+    int64_t size = numel() * sizeof(T);
+    holder_.reset(new PlaceholderImpl(
+        size, reinterpret_cast<void *>(const_cast<T *>(data)), typeid(T),
+        context_, command_queue_));
+    return reinterpret_cast<cl_mem>(holder_->ptr());
+  }
+  /**
+   * @brief   Return the cl buffer for elements of `type`.
+   * @note    A new buffer is allocated when none exists or the current one
+   *          is too small; an existing large-enough buffer is reused.
+   */
+  inline cl_mem mutable_data(std::type_index type) {
+    if (holder_ != nullptr) {
+      holder_->set_type(type);
+    }
+    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
+    int64_t size = numel() * SizeOfType(type);
+    if (holder_ == nullptr || holder_->size() < size + offset_) {
+      holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
+      offset_ = 0;
+    }
+    return reinterpret_cast<cl_mem>(holder_->ptr());
+  }
+  /**
+   * @brief   Return a pointer to cl buffer.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline cl_mem mutable_data() {
+    return reinterpret_cast<cl_mem>(mutable_data(typeid(T)));
+  }
+  /**
+   * @brief     Return a pointer to cl buffer.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline cl_mem mutable_data(DDim dims) {
+    Resize(dims);
+    return mutable_data<T>();
+  }
+  /**
+   * @brief   Return the existing cl buffer after check_memory_size().
+   */
+  inline cl_mem CLBuffer() {
+    check_memory_size();
+    return reinterpret_cast<cl_mem>(
+        reinterpret_cast<uintptr_t>(holder_->ptr()));
+  }
+  /**
+   * @brief   Read the whole cl buffer back to the host with a blocking read.
+   * @return  Pointer to a host copy owned by this tensor; it stays valid
+   *          until the next Data() call or until the tensor is destroyed.
+   */
+  template <typename T>
+  inline T *Data() {
+    ReleaseHostPtr();
+    cl_mem buffer = CLBuffer();
+    host_ptr_ = new char[holder_->size()];
+    cl_int status;
+    status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0,
+                                 holder_->size(), host_ptr_, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+    return reinterpret_cast<T *>(host_ptr_);
+  }
+  /** @brief Size in bytes of the current cl buffer. */
+  int memorySize() { return holder_->size(); }
+  ~CLTensor() {
+    DLOG << "~CLTensor";
+    if (host_ptr_) {
+      DLOG << " delete host ptr ";
+      ReleaseHostPtr();
+    }
+  }
+ private:
+  // Free the host copy made by Data(). It is allocated with new char[], so
+  // it has to be released with delete[] on a char pointer; a plain delete on
+  // a void pointer is undefined behaviour.
+  void ReleaseHostPtr() {
+    if (host_ptr_) {
+      delete[] static_cast<char *>(host_ptr_);
+      host_ptr_ = nullptr;
+    }
+  }
+  // Null until set by the constructor or SetContextAndCommandQueue().
+  cl_context context_ = nullptr;
+  cl_command_queue command_queue_ = nullptr;
+  // Host copy of the buffer contents created by Data(); nullptr otherwise.
+  void *host_ptr_ = nullptr;
+  // Placeholder that owns a cl buffer.
+  struct PlaceholderImpl : public Placeholder {
+    // Create a read-only buffer initialized with `size` bytes from `input`.
+    // NOTE(review): the clCreateBuffer error code is not requested, so a
+    // failed allocation goes unnoticed here.
+    PlaceholderImpl(size_t size, void *input, std::type_index type,
+                    cl_context context, cl_command_queue command_queue)
+        : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                              size, reinterpret_cast<void *>(input), NULL)),
+          size_(size),
+          type_(type),
+          command_queue_(command_queue) {}
+    // Create an uninitialized read-write buffer of `size` bytes.
+    PlaceholderImpl(size_t size, std::type_index type, cl_context context,
+                    cl_command_queue command_queue)
+        : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
+          size_(size),
+          type_(type),
+          command_queue_(command_queue) {}
+    virtual size_t size() const { return size_; }
+    virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
+    virtual std::type_index type() const { return type_; }
+    virtual void set_type(std::type_index type) { type_ = type; }
+    std::unique_ptr<_cl_mem, CLMemDeleter> ptr_;
+    size_t size_;
+    /* the current type of memory */
+    std::type_index type_;
+    cl_command_queue command_queue_;
+  };
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_tool.cpp
+++ b/src/framework/cl/cl_tool.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "framework/cl/cl_tool.h"
+namespace paddle_mobile {
+namespace framework {
+// Map an OpenCL status code to the name of its constant.
+// Returns "UNKNOWN ERROR CODE" for values that are not listed. Codes
+// introduced by OpenCL 1.2 / 2.0 / 2.2 are only listed when the included
+// headers define the matching CL_VERSION_x_y macro.
+const char *opencl_error_to_str(cl_int error) {
+#define CASE_CL_CONSTANT(NAME) \
+  case NAME:                   \
+    return #NAME;
+  // Suppose that no combinations are possible.
+  switch (error) {
+    CASE_CL_CONSTANT(CL_SUCCESS)
+    CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND)
+    CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE)
+    CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE)
+    CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE)
+    CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES)
+    CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY)
+    CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE)
+    CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP)
+    CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH)
+    CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED)
+    CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE)
+    CASE_CL_CONSTANT(CL_MAP_FAILURE)
+    CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET)
+    CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
+#ifdef CL_VERSION_1_2
+    CASE_CL_CONSTANT(CL_COMPILE_PROGRAM_FAILURE)
+    CASE_CL_CONSTANT(CL_LINKER_NOT_AVAILABLE)
+    CASE_CL_CONSTANT(CL_LINK_PROGRAM_FAILURE)
+    CASE_CL_CONSTANT(CL_DEVICE_PARTITION_FAILED)
+    CASE_CL_CONSTANT(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)
+#endif
+    CASE_CL_CONSTANT(CL_INVALID_VALUE)
+    CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE)
+    CASE_CL_CONSTANT(CL_INVALID_PLATFORM)
+    CASE_CL_CONSTANT(CL_INVALID_DEVICE)
+    CASE_CL_CONSTANT(CL_INVALID_CONTEXT)
+    CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES)
+    CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE)
+    CASE_CL_CONSTANT(CL_INVALID_HOST_PTR)
+    CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT)
+    CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
+    CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_SAMPLER)
+    CASE_CL_CONSTANT(CL_INVALID_BINARY)
+    CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS)
+    CASE_CL_CONSTANT(CL_INVALID_PROGRAM)
+    CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE)
+    CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME)
+    CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION)
+    CASE_CL_CONSTANT(CL_INVALID_KERNEL)
+    CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX)
+    CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE)
+    CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS)
+    CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION)
+    CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET)
+    CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST)
+    CASE_CL_CONSTANT(CL_INVALID_EVENT)
+    CASE_CL_CONSTANT(CL_INVALID_OPERATION)
+    CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT)
+    CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL)
+    CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_PROPERTY)
+#ifdef CL_VERSION_1_2
+    // clCreateImage reports descriptor problems with this code.
+    CASE_CL_CONSTANT(CL_INVALID_IMAGE_DESCRIPTOR)
+    CASE_CL_CONSTANT(CL_INVALID_COMPILER_OPTIONS)
+    CASE_CL_CONSTANT(CL_INVALID_LINKER_OPTIONS)
+    CASE_CL_CONSTANT(CL_INVALID_DEVICE_PARTITION_COUNT)
+#endif
+#ifdef CL_VERSION_2_0
+    CASE_CL_CONSTANT(CL_INVALID_PIPE_SIZE)
+    CASE_CL_CONSTANT(CL_INVALID_DEVICE_QUEUE)
+#endif
+#ifdef CL_VERSION_2_2
+    CASE_CL_CONSTANT(CL_INVALID_SPEC_ID)
+    CASE_CL_CONSTANT(CL_MAX_SIZE_RESTRICTION_EXCEEDED)
+#endif
+    default:
+      return "UNKNOWN ERROR CODE";
+  }
+#undef CASE_CL_CONSTANT
+}
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_tool.h
+++ b/src/framework/cl/cl_tool.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "CL/cl.h"
+namespace paddle_mobile {
+namespace framework {
+const char* opencl_error_to_str(cl_int error);
+#define CL_CHECK_ERRORS(ERR)                                          \
+  if (ERR != CL_SUCCESS) {                                            \
+    printf(                                                           \
+        "OpenCL error with code %s happened in file %s at line %d. "  \
+        "Exiting.\n",                                                 \
+        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
+        __LINE__);                                                    \
+  }
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) {
    return DataLayout::kAnyLayout;
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
-    exit(0);
  }
 }
@@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
      return "ANY_LAYOUT";
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
-      exit(0);
      break;
  }
 }

--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -42,7 +42,7 @@ struct Dim {
      : head(idx % size.head), tail(idx / size.head, size.tail) {}
  /** Construct a Dim with each dimension set to the given index */
-  Dim(int64_t idx) : head(idx), tail(idx) {}
+  explicit Dim(int64_t idx) : head(idx), tail(idx) {}
  bool operator==(const Dim<i> &o) const {
    return (head == o.head) && (tail == o.tail);
@@ -65,7 +65,7 @@ template <>
 struct Dim<0> {
  static constexpr int dimensions = 0;
-  Dim(int64_t _head) {}
+  explicit Dim(int64_t _head) {}
  Dim() {}
@@ -131,7 +131,6 @@ int64_t &indexer(Dim<D> &dim, int idx) {
 template <>
 int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
-  exit(0);
 }
 template <int D>
@@ -148,7 +147,6 @@ int64_t indexer(const Dim<D> &dim, int idx) {
 template <>
 int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
-  exit(0);
 }
 }  // namespace

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "io/executor.h"
+#include "framework/executor.h"
 #include <algorithm>
 #include <utility>
 #include <vector>
@@ -26,11 +26,24 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
-#include "operators/math/gemm.h"
+#ifdef PADDLE_EXECUTOR_MULTITHREAD
+#include <queue>
+#include <utility>
+#include "common/threadpool.h"
+#endif
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_image.h"
+#endif
 namespace paddle_mobile {
+namespace framework {
 using framework::Variable;
+using framework::Variable;
+#pragma mark - executor
 template <typename Dtype, Precision P>
 Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
@@ -390,15 +403,92 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
-  Executor<Dtype, P>::Ptype *output_ptr =
+  if (output_tensor != nullptr) {
-      output_tensor->data<typename Executor<Dtype, P>::Ptype>();
+    Executor<Dtype, P>::Ptype *output_ptr =
-  std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
+        output_tensor->data<typename Executor<Dtype, P>::Ptype>();
-  for (int j = 0; j < output_tensor->numel(); ++j) {
+    std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
-    result_vector.push_back(output_ptr[j]);
+    for (int j = 0; j < output_tensor->numel(); ++j) {
+      result_vector.push_back(output_ptr[j]);
+    }
+    return result_vector;
+  } else {
+    DLOG << "return  empty vector";
+    return {};
  }
-  return result_vector;
 }
+#ifdef PADDLE_MOBILE_FPGA
+/**
+ * Makes the scope variable named `var_name` share the data of `t`.
+ *
+ * The variable is looked up (or created) through program_.scope->Var(), its
+ * LoDTensor is resized to the dimensions of `t`, and the tensor then shares
+ * storage with `t` instead of copying it.
+ */
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
+                                        string var_name) {
+  framework::Variable *target_var = program_.scope->Var(var_name);
+  framework::Tensor *target_tensor =
+      target_var->GetMutable<framework::LoDTensor>();
+  target_tensor->Resize(t.dims());
+  target_tensor->ShareDataWith(t);
+}
+// Feeds `t` to the network by injecting it into the scope variable "feed".
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
+  InjectVariable(t, "feed");
+}
+/**
+ * Returns a copy of the first output tensor of one op in block 0.
+ *
+ * @param id index of the op whose output is fetched; a negative value selects
+ *           the last op of the block.
+ * @return a shared_ptr to a Tensor constructed from the op's first output.
+ */
+template <typename Dtype, Precision P>
+std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
+  std::shared_ptr<framework::BlockDesc> to_predict_block =
+      to_predict_program_->Block(0);
+  auto &ops = ops_of_block_[*to_predict_block.get()];
+  // Compare as signed int: comparing `id` against the unsigned ops.size()
+  // converts a negative id to a huge value, which would reject the
+  // "negative means last op" case handled just below. The emptiness check
+  // keeps ops[ops.size() - 1] from indexing out of bounds.
+  PADDLE_MOBILE_ENFORCE(!ops.empty() && id < static_cast<int>(ops.size()),
+                        "Index out of range");
+  auto last_op = id < 0 ? ops[ops.size() - 1] : ops[id];
+  auto output_map = last_op->Outputs();
+  std::vector<std::string> out_keys = last_op->GetOutKeys();
+  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output");
+  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
+      out_keys[0], output_map, *(program_.scope));
+  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
+}
+/**
+ * Runs the ops of block 0 with indices in [start, end).
+ *
+ * A negative `end` means "up to and including the last op". When
+ * PADDLE_MOBILE_PROFILE is defined, begin/end timestamps of every executed op
+ * are recorded into a local profile vector.
+ */
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_From_To(int start, int end) {
+  std::shared_ptr<framework::BlockDesc> first_block =
+      to_predict_program_->Block(0);
+  auto &ops = ops_of_block_[*first_block.get()];
+  if (end < 0) {
+    end = static_cast<int>(ops.size());
+  }
+  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
+                        "start or end parameter is wrong");
+#ifdef PADDLE_MOBILE_PROFILE
+  std::vector<ProfInfo> profile(ops.size());
+#endif
+  for (int op_idx = start; op_idx < end; ++op_idx) {
+#ifdef PADDLE_MOBILE_PROFILE
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    profile[op_idx].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+#endif
+    DLOG << "Running op: " << op_idx << "  " << ops[op_idx]->Type();
+    ops[op_idx]->Run();
+#ifdef PADDLE_MOBILE_PROFILE
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    profile[op_idx].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+#endif
+  }
+}
+// Runs block 0 starting at op index `start`; the end index is left to
+// Predict_From_To's default argument (declared outside this view).
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_From(int start) {
+  Predict_From_To(start);
+}
+// Runs the ops of block 0 from index 0 up to (not including) `end`.
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_To(int end) {
+  Predict_From_To(0, end);
+}
+#endif
 #ifdef PADDLE_MOBILE_FPGA
 template <typename Dtype, Precision P>
 void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
@@ -470,8 +560,232 @@ void Executor<Dtype, P>::Predict_To(int end) {
 }
 #endif
+#ifdef PADDLE_MOBILE_CL
+// Generic version of the CL parameter loader: intentionally a no-op. Only the
+// GPU_CL specialization below actually decodes data.
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
+                                    float *tensorInput, char **data) {}
+/**
+ * GPU_CL specialization: decodes one serialized parameter from the byte
+ * stream at `*data` into `tensorInput` and advances `*data` past it.
+ *
+ * Layout consumed, in order: a uint32 version, the LoD section (uint64 level
+ * count, then per level a uint64 byte size followed by the offsets), a uint32
+ * tensor version, an int32-prefixed tensor description, and the payload.
+ * The element count comes from var_desc's tensor dims, not from the stream.
+ *
+ * @param var_desc    descriptor providing the tensor dimensions.
+ * @param tensorInput destination; must hold at least prod(dims) floats.
+ * @param data        in/out cursor into the serialized parameter bytes.
+ *
+ * All multi-byte header fields are read with memcpy because the cursor is a
+ * plain char pointer with no alignment guarantee.
+ */
+template <>
+void Executor<GPU_CL, Precision::FP32>::LoadMemory(
+    const framework::VarDesc var_desc, float *tensorInput, char **data) {
+  // 1. version: value is not used, only skipped.
+  (*data) += sizeof(uint32_t);
+
+  // 2. LoD information: the offsets are not needed here, so each level is
+  // skipped. As before, only whole size_t entries are consumed.
+  uint64_t lod_level = 0;
+  memcpy(&lod_level, *data, sizeof(uint64_t));
+  (*data) += sizeof(uint64_t);
+  for (uint64_t i = 0; i < lod_level; ++i) {
+    uint64_t level_bytes = 0;
+    memcpy(&level_bytes, *data, sizeof(uint64_t));
+    (*data) += sizeof(uint64_t);
+    (*data) += (level_bytes / sizeof(size_t)) * sizeof(size_t);
+  }
+
+  // 3. tensor version: value is not used, only skipped.
+  (*data) += sizeof(uint32_t);
+
+  // 4. tensor desc: length-prefixed blob. The shape is taken from var_desc,
+  // so the blob itself is skipped rather than copied.
+  int32_t desc_size = 0;
+  memcpy(&desc_size, *data, sizeof(int32_t));
+  (*data) += sizeof(int32_t);
+  PADDLE_MOBILE_ENFORCE(desc_size >= 0, "invalid tensor desc size");
+  (*data) += desc_size;
+
+  const framework::TensorDesc &desc = var_desc.Tensor_desc();
+  int memory_size = 1;
+  for (auto l : desc.Dims()) {
+    memory_size *= l;
+  }
+
+  if (program_.quantification) {
+    // Quantized payload: min and max as floats, then one uint8 per element
+    // that is mapped linearly back into [min, max].
+    float min_value;
+    float max_value;
+    memcpy(&min_value, *data, sizeof(float));
+    memcpy(&max_value, *data + sizeof(float), sizeof(float));
+    *data += 2 * sizeof(float);
+    const float factor = (max_value - min_value) / 255.0;
+    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
+    for (int k = 0; k < memory_size; ++k) {
+      tensorInput[k] = uint8_data[k] * factor + min_value;
+    }
+    *data += (memory_size * sizeof(uint8_t));
+  } else {
+    // Plain payload: one 4-byte float per element. Values with magnitude
+    // below 1e-30 are stored as exactly 0.
+    for (int n = 0; n < memory_size; n++) {
+      float value;
+      memcpy(&value, *data + n * sizeof(float), sizeof(float));
+      if (value < 1e-30 && value > -1e-30) {
+        tensorInput[n] = 0.0;
+      } else {
+        tensorInput[n] = value;
+      }
+    }
+    (*data) += (sizeof(char) * memory_size * sizeof(float));
+  }
+}
+/**
+ * GPU_CL specialization: prepares every variable of the program to predict
+ * when parameters are stored one file per variable.
+ *
+ * Persistable variables other than "feed"/"fetch" are read from
+ * `<model_path>/<variable name>`, decoded to floats by LoadMemory and handed
+ * to the variable's CLImage. Non-persistable LoD tensors get an empty image
+ * created on the scope's CL context and command queue.
+ */
+template <>
+void Executor<GPU_CL, Precision::FP32>::InitMemory() {
+  for (const auto &block : to_predict_program_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program_.scope->Var(var_desc->Name());
+      if (var_desc->Persistable()) {
+        CLImage *cl_image = nullptr;
+        // "feed" and "fetch" stay plain LoDTensors and load no parameter file.
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          var->template GetMutable<framework::LoDTensor>();
+          continue;
+        } else {
+          cl_image = var->template GetMutable<framework::CLImage>();
+        }
+        char *origin_data =
+            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
+        // `data` is the read cursor advanced by LoadMemory; `origin_data`
+        // keeps the start of the buffer for the release below.
+        char *data = origin_data;
+        cl_context context = program_.scope->GetCLScpoe()->Context();
+        const framework::TensorDesc &desc = var_desc->Tensor_desc();
+        int numel = 1;
+        for (auto l : desc.Dims()) {
+          numel *= l;
+        }
+        DLOG << var_desc->Name();
+        float *tensorInput = static_cast<float *>(
+            paddle_mobile::memory::Alloc(sizeof(float) * numel));
+        LoadMemory(*var_desc, tensorInput, &data);
+        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        // has not init
+        cl_image->SetTensorData(tensorInput, ddim);
+        // NOTE(review): if ReadFileToBuff allocates with new[], this should
+        // be delete[] - confirm against its definition.
+        delete origin_data;
+        // NOTE(review): if SetTensorData keeps the pointer instead of copying,
+        // freeing tensorInput here leaves the image with a dangling pointer -
+        // confirm against CLImage.
+        paddle_mobile::memory::Free(tensorInput);
+      } else {
+        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
+          auto cl_image = var->template GetMutable<framework::CLImage>();
+          cl_context context = program_.scope->GetCLScpoe()->Context();
+          cl_command_queue command_queue =
+              program_.scope->GetCLScpoe()->CommandQueue();
+          const framework::TensorDesc &desc = var_desc->Tensor_desc();
+          //          framework::DDim ddim = framework::make_ddim(desc.Dims());
+          // Uses the dims already stored on the image rather than the
+          // descriptor's dims.
+          framework::DDim ddim = cl_image->dims();
+          DLOG << var_desc->Name();
+          cl_image->InitEmptyImage(context, command_queue, ddim);
+        }
+      }
+    }
+  }
+}
+/**
+ * GPU_CL specialization: prepares every variable of the program to predict
+ * when all parameters live in one combined buffer.
+ *
+ * The buffer is either the caller-provided program_.combined_params_buf or
+ * the contents of program_.para_path. Persistable variables other than
+ * "feed"/"fetch" are decoded sequentially from it by LoadMemory; every other
+ * variable gets an empty CLImage. The buffer is released at the end.
+ */
+template <>
+void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
+  char *origin_data;
+  if (program_.combined_params_buf && program_.combined_params_len) {
+    LOG(kLOG_INFO) << "use outter memory";
+    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
+  } else {
+    LOG(kLOG_INFO) << " begin init combine memory";
+    origin_data = ReadFileToBuff(program_.para_path);
+  }
+  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
+  // LoadMemory advances the pointer it is given. Walk the buffer with a
+  // separate cursor so origin_data still points at the start of the
+  // allocation when it is released below.
+  char *data = origin_data;
+  for (const auto &block : to_predict_program_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program_.scope->Var(var_desc->Name());
+      if (var_desc->Persistable()) {
+        CLImage *cl_image = nullptr;
+        // "feed" and "fetch" stay plain LoDTensors and carry no parameters.
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          var->template GetMutable<framework::LoDTensor>();
+          continue;
+        } else {
+          cl_image = var->template GetMutable<framework::CLImage>();
+        }
+        cl_context context = program_.scope->GetCLScpoe()->Context();
+        const framework::TensorDesc &desc = var_desc->Tensor_desc();
+        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        int numel = 1;
+        for (int i = 0; i < ddim.size(); i++) {
+          numel = numel * ddim[i];
+        }
+        float *tensorInput = static_cast<float *>(
+            paddle_mobile::memory::Alloc(sizeof(float) * numel));
+        LoadMemory(*var_desc, tensorInput, &data);
+        // has not init
+        cl_image->SetTensorData(tensorInput, ddim);
+        // NOTE(review): if SetTensorData keeps the pointer instead of copying,
+        // freeing tensorInput here leaves the image with a dangling pointer -
+        // confirm against CLImage.
+        paddle_mobile::memory::Free(tensorInput);
+      } else {
+        auto cl_image = var->template GetMutable<framework::CLImage>();
+        cl_context context = program_.scope->GetCLScpoe()->Context();
+        cl_command_queue command_queue =
+            program_.scope->GetCLScpoe()->CommandQueue();
+        const framework::TensorDesc &desc = var_desc->Tensor_desc();
+        framework::DDim ddim = cl_image->dims();
+        //        framework::DDim ddim = framework::make_ddim(desc.Dims());
+        cl_image->InitEmptyImage(context, command_queue, ddim);
+      }
+    }
+  }
+  // Release the buffer through its original start address.
+  // NOTE(review): if the buffer was allocated with new[], this should be
+  // delete[] - confirm against ReadFileToBuff and the callers that supply
+  // combined_params_buf.
+  delete origin_data;
+  LOG(kLOG_INFO) << " end init combine memory ";
+}
+#endif
 template class Executor<CPU, Precision::FP32>;
-template class Executor<GPU_MALI, Precision::FP32>;
 template class Executor<FPGA, Precision::FP32>;
+template class Executor<GPU_CL, Precision::FP32>;
+template class Executor<GPU_MALI, Precision::FP32>;
+}  // namespace framework
 }  // namespace paddle_mobile
--- a/src/io/executor.h
+++ b/src/io/executor.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "framework/tensor.h"
 namespace paddle_mobile {
+namespace framework {
 template <typename Dtype = CPU, Precision P = Precision::FP32>
 class Executor {
@@ -79,7 +80,10 @@ class Executor {
  void LoadMemory(void **data,
                  const std::shared_ptr<framework::VarDesc> var_desc,
                  framework::LoDTensor *tensor);
+#ifdef PADDLE_MOBILE_CL
+  void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
+                  char **data);
+#endif
  framework::Program<Dtype> program_;
  int batch_size_ = 1;
  std::shared_ptr<framework::ProgramDesc> to_predict_program_;
@@ -97,4 +101,5 @@ class Executor {
  bool loddable_ = false;
 };
+}  // namespace framework
 }  // namespace paddle_mobile
--- a/src/io/loader.cpp
+++ b/src/io/loader.cpp
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "io/loader.h"
+#include "framework/loader.h"
 #include "framework/lod_tensor.h"
 #include "framework/program/program-optimize/program_optimize.h"
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_image.h"
+#endif
 namespace paddle_mobile {
-using framework::Variable;
+namespace framework {
 /**
 * muteandresize tensor as originProgramDesc and scope in loadParams
@@ -26,23 +29,24 @@ using framework::Variable;
 * @param originProgramDesc
 * @param scope
 */
-void InitMemoryFromProgram(
+template <typename Dtype, Precision P>
-    std::shared_ptr<framework::ProgramDesc> &originProgramDesc,  // NOLINT
+void Loader<Dtype, P>::InitMemoryFromProgram(
-    std::shared_ptr<framework::Scope> &scope) {                  // NOLINT
+    const std::shared_ptr<ProgramDesc> &originProgramDesc,
+    const std::shared_ptr<Scope> &scope) {
  for (const auto &block : originProgramDesc.get()->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = scope.get()->Var(var_desc->Name());
-      if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
+      if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
        if (var_desc->Persistable()) {
          auto dim = var_desc->Tensor_desc().Dims();
-          auto tensor = var->GetMutable<framework::LoDTensor>();
+          auto tensor = var->GetMutable<LoDTensor>();
-          tensor->Resize(framework::make_ddim(dim));
+          tensor->Resize(make_ddim(dim));
        } else {
          auto dim = var_desc->Tensor_desc().Dims();
          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
          dim[0] = 1;
-          auto tensor = var->GetMutable<framework::LoDTensor>();
+          auto tensor = var->GetMutable<LoDTensor>();
-          tensor->Resize(framework::make_ddim(dim));
+          tensor->Resize(make_ddim(dim));
        }
      } else {
        // TODO(codeWorm): some.
@@ -50,6 +54,36 @@ void InitMemoryFromProgram(
    }
  }
 }
+#ifdef PADDLE_MOBILE_CL
+/**
+ * GPU_CL specialization: sizes a CLImage for every LoD-tensor variable of
+ * the program.
+ *
+ * Persistable variables keep the dims from their tensor description;
+ * non-persistable ones must have at least one dim, and the first dim is
+ * forced to 1. Variables of other types are left untouched.
+ */
+template <>
+void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
+    const std::shared_ptr<ProgramDesc> &originProgramDesc,
+    const std::shared_ptr<Scope> &scope) {
+  for (const auto &block : originProgramDesc->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = scope->Var(var_desc->Name());
+      if (var_desc->Type() != VARTYPE_TYPE_LOD_TENSOR) {
+        // TODO(codeWorm): some.
+        continue;
+      }
+      auto dim = var_desc->Tensor_desc().Dims();
+      if (!var_desc->Persistable()) {
+        PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
+        dim[0] = 1;
+      }
+      auto cl_image = var->GetMutable<framework::CLImage>();
+      cl_image->Resize(make_ddim(dim));
+    }
+  }
+}
+#endif
 /**
 * fusion and print someinfos
 * @tparam Dtype
@@ -61,19 +95,18 @@ void InitMemoryFromProgram(
 */
 template <typename Dtype, Precision P>
 void FusionAndPrintInfos(
-    bool optimize, bool can_add_split,
+    bool optimize, bool can_add_split, Program<Dtype, P> *program,
-    framework::Program<Dtype, P> &program,  // NOLINT
+    const std::shared_ptr<ProgramDesc> &originProgramDesc) {
-    const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) {
  if (optimize) {
-    framework::ProgramOptimize program_optimize;
+    ProgramOptimize program_optimize;
-    program.optimizeProgram =
+    program->optimizeProgram =
        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
-    if (!program.optimizeProgram) {
+    if (!program->optimizeProgram) {
-      program.optimizeProgram = originProgramDesc;
+      program->optimizeProgram = originProgramDesc;
    }
  }
  if (optimize) {
-    program.optimizeProgram->Description("optimize: ");
+    program->optimizeProgram->Description("optimize: ");
  } else {
    originProgramDesc->Description("program: ");
  }
@@ -102,9 +135,10 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
 }
 template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
+const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
-    const std::string &dirname, bool optimize, bool quantification,
+                                               bool optimize,
-    bool can_add_split) {
+                                               bool quantification,
+                                               bool can_add_split) {
  auto program = this->LoadProgram(dirname + "/__model__", optimize,
                                   quantification, can_add_split);
  program.model_path = dirname;
@@ -112,9 +146,10 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
 }
 template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
+const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
-    const std::string &model_path, const std::string &para_path, bool optimize,
+                                               const std::string &para_path,
-    bool quantification) {
+                                               bool optimize,
+                                               bool quantification) {
  auto program = this->LoadProgram(model_path, optimize, quantification);
  program.para_path = para_path;
@@ -124,7 +159,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
 }
 template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
+const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
    const std::string &model_path, bool optimize, bool quantification,
    bool can_add_split) {
  std::string model_filename = model_path;
@@ -141,29 +176,29 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
  //
  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
  //
-  auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
+  auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
-  framework::Program<Dtype, P> program;
+  Program<Dtype, P> program;
  program.originProgram = originProgramDesc;
  program.quantification = quantification;
  program.combined_params_len = 0;
  program.combined_params_buf = nullptr;
-  auto scope = std::make_shared<framework::Scope>();
+  auto scope = std::make_shared<Scope>();
  program.scope = scope;
  // use  originProgramDesc and scope to init tensors
  InitMemoryFromProgram(originProgramDesc, scope);
  // perform fusion and print infos
-  FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
+  FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
  return program;
 }
 template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
+const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
    size_t read_size, const uint8_t *buf, size_t combined_params_len,
-    const uint8_t *combined_params_buf, bool optimize, bool quantification) {
+    uint8_t *combined_params_buf, bool optimize, bool quantification) {
  bool can_add_split = false;
  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
@@ -177,26 +212,31 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
  //
-  auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
+  auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
-  framework::Program<Dtype, P> program;
+  Program<Dtype, P> program;
  program.combined = true;
  program.originProgram = originProgramDesc;
  program.quantification = quantification;
  program.combined_params_len = combined_params_len;
  program.combined_params_buf = combined_params_buf;
-  auto scope = std::make_shared<framework::Scope>();
+  auto scope = std::make_shared<Scope>();
  program.scope = scope;
  InitMemoryFromProgram(originProgramDesc, scope);
-  FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc);
+  FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc);
  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
                                                               nullptr);
  return program;
 }
 template class Loader<CPU, Precision::FP32>;
 template class Loader<FPGA, Precision::FP32>;
 template class Loader<GPU_MALI, Precision::FP32>;
+template class Loader<GPU_CL, Precision::FP32>;
+}  // namespace framework
 }  // namespace paddle_mobile
--- a/src/framework/loader.h
+++ b/src/framework/loader.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "common/types.h"
+#include "framework/program/program.h"
+namespace paddle_mobile {
+namespace framework {
+/*
+ * Loads a fluid model description into a Program for the given device type
+ * and precision.
+ * */
+template <typename Dtype = CPU, Precision P = Precision::FP32>
+class Loader {
+ public:
+  /*
+   * @b load separate format fluid model
+   * @b i.e. a model directory in which the parameters are stored as separate
+   *    files next to the "__model__" description
+   * */
+  const Program<Dtype, P> Load(const std::string &dirname,
+                               bool optimize = false,
+                               bool quantification = false,
+                               bool can_add_split = false);
+  /*
+   * @b load combine format fluid model
+   * @b i.e. one model file plus one file holding all parameters together
+   * */
+  const Program<Dtype, P> Load(const std::string &model_path,
+                               const std::string &para_path,
+                               bool optimize = false,
+                               bool quantification = false);
+  /*
+   * @b load a combine format model whose description and parameters are
+   *    already in memory; the parameter buffer pointer and length are stored
+   *    on the returned Program
+   * */
+  const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
+                                             const uint8_t *model_buf,
+                                             size_t combined_params_len,
+                                             uint8_t *combined_params_buf,
+                                             bool optimize = false,
+                                             bool quantification = false);
+ private:
+  // Builds a Program from the model description file at model_path.
+  const Program<Dtype, P> LoadProgram(const std::string &model_path,
+                                      bool optimize = false,
+                                      bool quantification = false,
+                                      bool can_add_split = false);
+  // Creates and resizes the scope variables described by originProgramDesc.
+  void InitMemoryFromProgram(
+      const std::shared_ptr<ProgramDesc> &originProgramDesc,
+      const std::shared_ptr<Scope> &scope);
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/op_registry.h
+++ b/src/framework/op_registry.h
@@ -14,8 +14,10 @@ limitations under the License. */
 #pragma once
+#include <memory>
 #include <string>
 #include <tuple>
 #include "common/log.h"
 #include "common/type_define.h"
 #include "framework/op_info.h"
@@ -120,5 +122,8 @@ class OpRegistry {
 #define REGISTER_OPERATOR_FPGA(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
+// Registers op_class under op_type for the OpenCL (GPU_CL) device type.
+#define REGISTER_OPERATOR_CL(op_type, op_class) \
+  REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL);
 }  // namespace framework
 }  // namespace paddle_mobile
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -56,37 +56,69 @@ template <typename Dtype>
 void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
 template <typename Dtype>
-void OperatorBase<Dtype>::Run() const {
+void OperatorBase<Dtype>::Run() {
+  DLOG << " ----- Begin run impl --- " << type_ << " ----- ";
  RunImpl();
-#ifdef PADDLE_MOBILE_DEBUG
+  DLOG << " ----- End run impl --- " << type_ << " ----- ";
-  DLOG << "-------------" << type_ << "----------------------------";
+  //#ifdef PADDLE_MOBILE_DEBUG
-  vector<string> input_keys = GetInputKeys();
+  //  DLOG << "-------------" << type_ << "----------------------------";
-  for (const auto key : input_keys) {
+  //  vector<string> input_keys = GetInputKeys();
-    auto var_vec_in = inputs_.at(key);
+  //  for (const auto key : input_keys) {
-    for (int i = 0; i < var_vec_in.size(); ++i) {
+  //    auto var_vec_in = inputs_.at(key);
-      auto vari = scope_->FindVar(var_vec_in[i]);
+  //    for (int i = 0; i < var_vec_in.size(); ++i) {
-      if (vari->IsInitialized()) {
+  //      auto vari = scope_->FindVar(var_vec_in[i]);
-        Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+  //      if (vari->IsInitialized()) {
-        if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
+  //#ifdef PADDLE_MOBILE_CL
-      }
+  //        if (type_ == "feed") {
-    }
+  //          Tensor *tensor = vari->template
-  }
+  //          GetMutable<framework::LoDTensor>(); if (tensor) DLOG << type_ << "
-  for (const auto key : GetOutKeys()) {
+  //          input- " << key << "=" << *tensor;
-    auto var_vec_out = outputs_.at(key);
+  //        } else {
-    for (int i = 0; i < var_vec_out.size(); ++i) {
+  //          CLImage *cl_image = vari->template
-      auto vari = scope_->FindVar(var_vec_out[i]);
+  //          GetMutable<framework::CLImage>(); if (cl_image) {
-      if (vari->IsInitialized()) {
+  //            DLOG << type_ << " input- " << key << "=" << *cl_image;
-        Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+  //          }
-        if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
+  //        }
-      }
+  //
-    }
+  //#else
-  }
+  //        Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
-#endif
+  //        if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
+  //#endif
+  //      }
+  //    }
+  //  }
+  //  for (const auto key : GetOutKeys()) {
+  //    auto var_vec_out = outputs_.at(key);
+  //    for (int i = 0; i < var_vec_out.size(); ++i) {
+  //      auto vari = scope_->FindVar(var_vec_out[i]);
+  //      if (vari->IsInitialized()) {
+  //#ifdef PADDLE_MOBILE_CL
+  //        if (type_ == "fetch") {
+  //          Tensor *tensor = vari->template
+  //          GetMutable<framework::LoDTensor>(); if (tensor) {
+  //            DLOG << type_ << " output- " << key << "=" << *tensor;
+  //          }
+  //        } else {
+  //          CLImage *cl_image = vari->template
+  //          GetMutable<framework::CLImage>(); if (cl_image) {
+  //            DLOG << type_ << " output- " << key << "=" << *cl_image;
+  //          }
+  //        }
+  //
+  //#else
+  //        Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
+  //        if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
+  //#endif
+  //      }
+  //    }
+  //  }
+  //#endif
 }
 template class OperatorBase<CPU>;
 template class OperatorBase<FPGA>;
 template class OperatorBase<GPU_MALI>;
+template class OperatorBase<GPU_CL>;
 }  // namespace framework
 }  // namespace paddle_mobile
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <map>
 #include <string>
+#include <utility>
 #include <vector>
 #include "common/enforce.h"
@@ -31,7 +32,10 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_helper.h"
+#include "framework/cl/cl_scope.h"
+#endif
 namespace paddle_mobile {
 namespace framework {
 using std::string;
@@ -59,10 +63,10 @@ class OperatorBase {
               const VariableNameMap &outputs, const AttributeMap &attrs,
               std::shared_ptr<Scope> scope);
  virtual ~OperatorBase() {}
-  void Run() const;
+  void Run();
  std::vector<string> GetOutKeys() const;
  std::vector<string> GetInputKeys() const;
-  virtual void RunImpl() const = 0;
+  virtual void RunImpl() = 0;
  virtual void Init() = 0;
  /*
@@ -112,9 +116,13 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     std::shared_ptr<Scope> scope)
      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
-        param_(inputs, outputs, attrs, *scope) {}
+        param_(inputs, outputs, attrs, *scope) {
+#ifdef PADDLE_MOBILE_CL
+    kernel_.InitCLHelper(scope->GetCLScpoe());
+#endif
+  }
-  virtual void RunImpl() const { this->kernel_.Compute(this->param_); }
+  virtual void RunImpl() { this->kernel_.Compute(this->param_); }
  virtual void InferShape() const = 0;
@@ -123,6 +131,7 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
    //      DLOG << i.first;
    //      DLOG << i.second;
    //    }
    PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), "  %s kernel init failed",
                          this->type_.c_str());
  }
@@ -138,22 +147,35 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
 template <typename Dtype, typename P>
 class OpKernelBase {
 public:
-  /*
+  OpKernelBase() = default;
-   * @b 所有kernel 需实现 Compute 方法
-   * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
+#ifdef PADDLE_MOBILE_CL
-   *    所有结构体存在与: paddle-mobile/src/operators/op_param.h
+  virtual void InitCLHelper(CLScope *clScope) {
-   * */
+    cl_helper_ = CLHelper(clScope);
-#ifdef PADDLE_MOBILE_MALI_GPU
+  }
+#endif
+    /*
+     * @b 所有kernel 需实现 Compute 方法
+     * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
+     *    所有结构体存在与: paddle-mobile/src/operators/op_param.h
+     * */
+#ifdef PADDLE_McOBILE_MALI_GPU
  OpKernelBase() { acl_op_ = nullptr; }
  void *GetAclOp() const { return acl_op_; }
  void SetAclOp(void *op, void *ob) const {
    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
  }
 #endif
-  virtual void Compute(const P &para) const = 0;
+  virtual void Compute(const P &para) = 0;
  virtual bool Init(P *para) { return true; }
  virtual ~OpKernelBase() = default;
+ protected:
+#ifdef PADDLE_MOBILE_CL
+  CLHelper cl_helper_;
+#endif
 private:
 #ifdef PADDLE_MOBILE_MALI_GPU
  void *acl_op_;

--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include "framework/program/program_desc.h"
 #include "framework/scope.h"
+#include <string>
 namespace paddle_mobile {
 namespace framework {
@@ -32,7 +34,7 @@ class Program {
  bool combined = false;
  bool quantification = false;
  size_t combined_params_len;
-  const uint8_t *combined_params_buf;
+  uint8_t *combined_params_buf;
 };
 }  // namespace framework

--- a/src/framework/scope.h
+++ b/src/framework/scope.h
@@ -15,8 +15,14 @@ limitations under the License. */
 #pragma once
 #include <list>
+#include <string>
 #include <unordered_map>
-#include "variable.h"
+#include <vector>
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_scope.h"
+#endif
+#include "framework/variable.h"
 namespace paddle_mobile {
 namespace framework {
@@ -33,6 +39,10 @@ class Scope {
      delete kid;
    }
    kids_.clear();
+#ifdef PADDLE_MOBILE_CL
+    delete cl_scope_;
+#endif
  }
  Scope &NewScope() const;
@@ -72,6 +82,10 @@ class Scope {
  Variable *FindVarLocally(const std::string &name) const;
+#ifdef PADDLE_MOBILE_CL
+  // Returns the CLScope pointer held in cl_scope_; the Scope destructor
+  // deletes it, so callers must not free it.
+  CLScope *GetCLScope() { return cl_scope_; }
+  // Misspelled legacy name, kept so existing callers keep compiling.
+  // Prefer GetCLScope() in new code.
+  CLScope *GetCLScpoe() { return GetCLScope(); }
+#endif
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const *parent) : parent_(parent) {}
@@ -79,6 +93,10 @@ class Scope {
  mutable std::unordered_map<std::string, Variable *> vars_;
  mutable std::list<Scope *> kids_;
  Scope const *parent_{nullptr};
+#ifdef PADDLE_MOBILE_CL
+  CLScope *cl_scope_ = new CLScope();
+#endif
 };
 }  // namespace framework
 }  // namespace paddle_mobile
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -24,65 +24,24 @@ limitations under the License. */
 #include <vector>
 #include "common/enforce.h"
-#include "common/types.h"
 #include "framework/data_layout.h"
-#include "framework/ddim.h"
+#include "framework/tensor_base.h"
 #include "memory/t_malloc.h"
 namespace paddle_mobile {
 namespace framework {
-template <typename... T>
-struct SizeOfTypeFunctor;
-template <typename T>
-struct SizeOfTypeFunctor<T> {
-  size_t operator()(std::type_index type) const {
-    if (typeid(T).hash_code() == type.hash_code()) {
-      return sizeof(T);
-    } else {
-      return 0UL;
-    }
-  }
-};
-template <>
-struct SizeOfTypeFunctor<> {
-  size_t operator()(std::type_index type) const { return 0UL; }
-};
-template <typename HEAD, typename... TAIL>
-struct SizeOfTypeFunctor<HEAD, TAIL...> {
-  size_t operator()(std::type_index type) const {
-    SizeOfTypeFunctor<HEAD> head;
-    size_t head_size = head(type);
-    if (head_size != 0) {
-      return head_size;
-    }
-    SizeOfTypeFunctor<TAIL...> tail;
-    return tail(type);
-  }
-};
-static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int8_t, int, half, float, double, int16_t, int64_t, bool,
-                    size_t>
-      functor;
-  size_t size = functor(type);
-  PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
-  return size;
-}
 class LoDTensor;
-class Tensor {
+class Tensor : public TensorBase {
 public:
-  Tensor() : offset_(0) {}
+  Tensor() {}
  template <typename T>
-  Tensor(std::vector<T> input, DDim ddim) : offset_(0) {
+  Tensor(std::vector<T> input, DDim ddim) {
    PADDLE_MOBILE_ENFORCE(
        input.size() == framework::product(ddim),
        "input vector'length should be equal to tensor's length");
    auto input_ptr = mutable_data<T>(ddim);
    for (int i = 0; i < input.size(); ++i) {
      input_ptr[i] = input[i];
@@ -95,46 +54,6 @@ class Tensor {
    this->offset_ = inTensor.offset_;
  }
-  /*! Return a pointer to mutable memory block. */
-  template <typename T>
-  inline T *data() {
-    check_memory_size();
-    PADDLE_MOBILE_ENFORCE(
-        (std::is_same<T, void>::value ||
-         holder_->type().hash_code() == typeid(T).hash_code()),
-        "Tensor holds the wrong type, it holds %s",
-        this->holder_->type().name());
-    return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-  }
-  /*! Return a pointer to constant memory block. */
-  template <typename T>
-  inline const T *data() const {
-    check_memory_size();
-    PADDLE_MOBILE_ENFORCE(
-        (std::is_same<T, void>::value ||
-         holder_->type().hash_code() == typeid(T).hash_code()),
-        "Tensor holds the wrong type, it holds %s ,requested:%s",
-        this->holder_->type().name(), typeid(T).name());
-    return reinterpret_cast<const T *>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-  /**
-   * @brief   Return a pointer to mutable memory block.
-   * @note    If not exist, then allocation.
-   */
-  template <typename T>
-  inline T *mutable_data() {
-    static_assert(std::is_pod<T>::value, "T must be POD");
-    return reinterpret_cast<T *>(mutable_data(typeid(T)));
-  }
 #ifdef PADDLE_MOBILE_DEBUG
  template <typename T>
  inline void dump(std::string filename) const {
@@ -151,6 +70,21 @@ class Tensor {
  }
 #endif
+  /*!
+   * Resize the dimensions of the memory block.
+   * Only dims_ is updated here; no memory is allocated or released.
+   * Implements the pure virtual TensorBase::Resize (covariant return type).
+   */
+  inline Tensor &Resize(const DDim &dims) override {
+    dims_ = dims;
+    return *this;
+  }
+  /*!
+   * Make this tensor share the memory block of src.
+   * src must already hold memory (checked via check_memory_size()). When the
+   * two tensors do not yet use the same holder, src is copy-assigned to
+   * *this; otherwise nothing is changed.
+   */
+  inline Tensor &ShareDataWith(const Tensor &src) {
+    src.check_memory_size();
+    const bool already_shared = (holder_.get() == src.holder_.get());
+    if (!already_shared) {
+      *this = src;
+    }
+    return *this;
+  }
  inline void *mutable_data(std::type_index type) {
    if (holder_ != nullptr) {
      holder_->set_type(type);
@@ -165,6 +99,16 @@ class Tensor {
        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
  }
+  /**
+   * @brief   Typed pointer to the mutable memory block.
+   * @note    Forwards to mutable_data(std::type_index) with typeid(T) and
+   *          casts the result; if no block exists yet, one is allocated.
+   *          T must be a POD type.
+   */
+  template <typename T>
+  inline T *mutable_data() {
+    static_assert(std::is_pod<T>::value, "T must be POD");
+    void *raw = mutable_data(typeid(T));
+    return reinterpret_cast<T *>(raw);
+  }
  /**
   * @brief     Return a pointer to mutable memory block.
   *
@@ -180,27 +124,6 @@ class Tensor {
    return mutable_data<T>();
  }
-  /*! Return the dimensions of the memory block. */
-  inline const DDim &dims() const { return dims_; }
-  /*! Return the numel of the memory block. */
-  inline int64_t numel() const { return product(dims_); }
-  /*! Resize the dimensions of the memory block. */
-  inline Tensor &Resize(const DDim &dims) {
-    dims_ = dims;
-    return *this;
-  }
-  /*! The internal of two tensors share the same memory block. */
-  inline Tensor &ShareDataWith(const Tensor &src) {
-    src.check_memory_size();
-    if (holder_.get() != src.holder_.get()) {
-      *this = src;
-    }
-    return *this;
-  }
  /**
   * @brief  Return a sub-tensor of the given tensor.
   *
@@ -234,44 +157,35 @@ class Tensor {
    }
  }
-  std::type_index type() const {
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T *data() {
+    check_memory_size();
    PADDLE_MOBILE_ENFORCE(
-        holder_ != nullptr,
+        (std::is_same<T, void>::value ||
-        "Tensor not initialized yet when Tensor::type() is called.")
+         holder_->type().hash_code() == typeid(T).hash_code()),
-    return holder_->type();
+        "Tensor holds the wrong type, it holds %s",
-  }
+        this->holder_->type().name());
-  // memory size returns the holding memory size in byte.
+    return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-  size_t memory_size() const {
+                                 offset_);
-    return holder_ == nullptr ? 0UL : holder_->size() - offset_;
  }
-  inline void check_memory_size() const {
+  /*! Return a pointer to constant memory block. */
+  template <typename T>
+  inline const T *data() const {
+    check_memory_size();
    PADDLE_MOBILE_ENFORCE(
-        holder_ != nullptr,
+        (std::is_same<T, void>::value ||
-        "Tensor holds no memory. Call Tensor::mutable_data first.");
+         holder_->type().hash_code() == typeid(T).hash_code()),
-    PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(),
+        "Tensor holds the wrong type, it holds %s ,requested:%s",
-                          "Tensor's dims_ is out of bound. ");
+        this->holder_->type().name(), typeid(T).name());
+    return reinterpret_cast<const T *>(
+        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
  }
 private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a
-   * template
-   *          parameter of Variable.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() = default;
-    virtual void *ptr() const = 0;
-    virtual size_t size() const = 0;
-    virtual std::type_index type() const = 0;
-    virtual void set_type(std::type_index type) = 0;
-  };
  struct PlaceholderImpl : public Placeholder {
    PlaceholderImpl(size_t size, std::type_index type)
        : ptr_(static_cast<uint8_t *>(memory::Alloc(size)),
@@ -299,27 +213,6 @@ class Tensor {
    std::type_index type_;
  };
-  /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
-  /**
-   * @brief points to elements dimensions.
-   *
-   * @note dims_ do not indicate the memory block size.
-   */
-  DDim dims_;
-  /**
-   * @brief   A PlaceHolder may be shared by more than one tensor.
-   *
-   * @note    Some of them may be slices of the others. So the offset_
-   *          is introduced here to indicate the byte offset between
-   *          PlaceHolder::ptr_ and where the tensor data really
-   * begins.
-   */
-  size_t offset_;
 #ifdef PADDLE_MOBILE_FPGA
 public:  // NOLINT
  inline void reset_data_ptr(void *p) {

--- a/src/framework/tensor_base.h
+++ b/src/framework/tensor_base.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+#include <typeindex>
+#include <typeinfo>
+#include "common/enforce.h"
+#include "common/types.h"
+#include "framework/ddim.h"
+namespace paddle_mobile {
+namespace framework {
+/**
+ * SizeOfTypeFunctor<Ts...> maps a runtime std::type_index to sizeof() of the
+ * first type in Ts whose typeid hash code matches; it yields 0 when none
+ * matches.
+ */
+template <typename... T>
+struct SizeOfTypeFunctor;
+/*! Single-type case: sizeof(T) on a hash-code match, otherwise 0. */
+template <typename T>
+struct SizeOfTypeFunctor<T> {
+  size_t operator()(std::type_index type) const {
+    const bool same_type = typeid(T).hash_code() == type.hash_code();
+    return same_type ? sizeof(T) : 0UL;
+  }
+};
+/*! Empty type list: nothing can match. */
+template <>
+struct SizeOfTypeFunctor<> {
+  size_t operator()(std::type_index type) const { return 0UL; }
+};
+/*! Recursive case: try HEAD first, then fall through to the TAIL list. */
+template <typename HEAD, typename... TAIL>
+struct SizeOfTypeFunctor<HEAD, TAIL...> {
+  size_t operator()(std::type_index type) const {
+    const size_t head_size = SizeOfTypeFunctor<HEAD>()(type);
+    if (head_size == 0) {
+      return SizeOfTypeFunctor<TAIL...>()(type);
+    }
+    return head_size;
+  }
+};
+/**
+ * Returns the size in bytes of the element type identified by `type`.
+ * Only the types listed below are recognised; any other type trips the
+ * enforce check.
+ */
+static inline size_t SizeOfType(std::type_index type) {
+  using SupportedTypes =
+      SizeOfTypeFunctor<int8_t, int, half, float, double, int16_t, int64_t,
+                        bool, size_t>;
+  size_t size = SupportedTypes()(type);
+  PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
+  return size;
+}
+/**
+ * @brief Common base of the tensor types. It stores the dimensions, the
+ *        shared memory holder and the byte offset into that holder, and
+ *        offers the size/type queries built on them.
+ */
+class TensorBase {
+ public:
+  // Declaring the destructor below would suppress the implicit move
+  // operations, so the copy/move set is defaulted explicitly to keep the
+  // value semantics the class had before.
+  TensorBase() = default;
+  TensorBase(const TensorBase &) = default;
+  TensorBase &operator=(const TensorBase &) = default;
+  TensorBase(TensorBase &&) = default;
+  TensorBase &operator=(TensorBase &&) = default;
+  /*!
+   * Virtual because the class is a polymorphic base (Resize is pure
+   * virtual): destroying a derived tensor through a TensorBase pointer must
+   * run the derived destructor.
+   */
+  virtual ~TensorBase() = default;
+  /*! Set the dimensions; implemented by each concrete tensor type. */
+  virtual inline TensorBase &Resize(const DDim &dims) = 0;
+  /*! True once a memory holder has been attached. */
+  inline bool IsInitialized() const { return holder_ != nullptr; }
+  /*! Return the dimensions of the memory block. */
+  inline const DDim &dims() const { return dims_; }
+  /*! Return the numel of the memory block. */
+  inline int64_t numel() const { return product(dims_); }
+  /*! Element type recorded by the holder; enforces that a holder exists. */
+  std::type_index type() const {
+    PADDLE_MOBILE_ENFORCE(
+        holder_ != nullptr,
+        "Tensor not initialized yet when Tensor::type() is called.");
+    return holder_->type();
+  }
+  // memory size returns the holding memory size in byte.
+  // It is the holder size minus offset_, or 0 when there is no holder.
+  size_t memory_size() const {
+    return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+  }
+  /*!
+   * Enforce that memory is held and that numel() elements of the held type
+   * fit into memory_size() bytes.
+   */
+  inline void check_memory_size() const {
+    PADDLE_MOBILE_ENFORCE(
+        holder_ != nullptr,
+        "Tensor holds no memory. Call Tensor::mutable_data first.");
+    PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(),
+                          "Tensor's dims_ is out of bound. ");
+  }
+ protected:
+  /**
+   * @note    Placeholder hides type T, so it doesn't appear as a
+   * template
+   *          parameter of Variable.
+   */
+  struct Placeholder {
+    virtual ~Placeholder() = default;
+    virtual void *ptr() const = 0;
+    virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
+    virtual void set_type(std::type_index type) = 0;
+  };
+  /**
+   * @brief points to elements dimensions.
+   *
+   * @note dims_ do not indicate the memory block size.
+   */
+  DDim dims_;
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+  /**
+   * @brief   A PlaceHolder may be shared by more than one tensor.
+   *
+   * @note    Some of them may be slices of the others. So the offset_
+   *          is introduced here to indicate the byte offset between
+   *          PlaceHolder::ptr_ and where the tensor data really
+   * begins.
+   */
+  size_t offset_ = 0;
+};
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -126,6 +126,8 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
      x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
    } else if (config.device == PaddleMobileConfig::kGPU_MALI) {
      x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
+    } else if (config.device == PaddleMobileConfig::kGPU_CL) {
+      x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
    } else {
      LOG(kLOG_ERROR) << "unsupport device type!";
      return nullptr;

--- a/src/ios_io/PaddleMobileCPU.h
+++ b/src/ios_io/PaddleMobileCPU.h
--- a/src/ios_io/PaddleMobileCPU.mm
+++ b/src/ios_io/PaddleMobileCPU.mm
--- a/src/jni/PML.java
+++ b/src/jni/PML.java
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
--- a/src/jni/paddle_mobile_jni.h
+++ b/src/jni/paddle_mobile_jni.h
--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -44,7 +44,7 @@ class PaddleBuf {
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_{false} {}
  // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}
  // Resize to `length` bytes.
  void Resize(size_t length);
@@ -121,7 +121,7 @@ struct PaddleModelMemoryPack {
 struct PaddleMobileConfig : public PaddlePredictor::Config {
  enum Precision { FP32 = 0 };
-  enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
+  enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 };
  enum Precision precision;
  enum Device device;

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -28,13 +28,13 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
                                  bool quantification, int batch_size,
                                  bool loddable) {
  if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
  } else {
    LOG(kLOG_INFO) << "loader inited";
  }
  if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
        loader_->Load(dirname, optimize, quantification), batch_size, optimize,
        loddable);
  } else {
@@ -50,13 +50,13 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
                                  bool quantification, int batch_size,
                                  bool loddable) {
  if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
  } else {
    LOG(kLOG_INFO) << "loader inited";
  }
  if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
        loader_->Load(model_path, para_path, optimize, quantification),
        batch_size, optimize, loddable);
  } else {
@@ -67,21 +67,22 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
 }
 template <typename Dtype, Precision P>
-bool PaddleMobile<Dtype, P>::LoadCombinedMemory(
+bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
-    size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
+                                                const uint8_t *model_buf,
-    const uint8_t *combined_params_buf) {
+                                                size_t combined_params_len,
+                                                uint8_t *combined_params_buf) {
  int batch_size = 1;
  bool optimise = true;
  bool quantification = false;
  if (loader_.get() == nullptr) {
-    loader_ = std::make_shared<Loader<Dtype, P>>();
+    loader_ = std::make_shared<framework::Loader<Dtype, P>>();
  } else {
    LOG(kLOG_INFO) << "loader inited";
  }
  if (executor_.get() == nullptr) {
-    executor_ = std::make_shared<Executor<Dtype, P>>(
+    executor_ = std::make_shared<framework::Executor<Dtype, P>>(
        loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                    combined_params_buf, optimise,
                                    quantification),
@@ -161,4 +162,6 @@ template class PaddleMobile<CPU, Precision::FP32>;
 template class PaddleMobile<FPGA, Precision::FP32>;
 template class PaddleMobile<GPU_MALI, Precision::FP32>;
+template class PaddleMobile<GPU_CL, Precision::FP32>;
 }  // namespace paddle_mobile
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -22,10 +22,10 @@ limitations under the License. */
 #endif  // _OPENMP
 #include "common/types.h"
+#include "framework/executor.h"
 #include "framework/load_ops.h"
+#include "framework/loader.h"
 #include "framework/tensor.h"
-#include "io/executor.h"
-#include "io/loader.h"
 namespace paddle_mobile {
@@ -52,7 +52,7 @@ class PaddleMobile {
  bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
                          size_t combined_params_len,
-                          const uint8_t *combined_params_buf);
+                          uint8_t *combined_params_buf);
  void SetThreadNum(int num);
  void Clear();
@@ -69,8 +69,8 @@ class PaddleMobile {
 #endif
 private:
-  std::shared_ptr<Loader<Dtype, P>> loader_;
+  std::shared_ptr<framework::Loader<Dtype, P>> loader_;
-  std::shared_ptr<Executor<Dtype, P>> executor_;
+  std::shared_ptr<framework::Executor<Dtype, P>> executor_;
 };
 }  // namespace paddle_mobile
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 #ifdef BATCHNORM_OP
-#include "batchnorm_op.h"
+#include "operators/batchnorm_op.h"
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
@@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #ifdef PADDLE_MOBILE_FPGA
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp);
+#endif
 #endif
--- a/src/operators/bilinear_interp_op.h
+++ b/src/operators/bilinear_interp_op.h
@@ -40,10 +40,6 @@ class BilinearOp : public framework::OperatorWithKernel<
            DeviceType, BilinearInterpParam<DeviceType>,
            operators::BilinearInterpKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, BilinearInterpParam<DeviceType>,
-      operators::BilinearInterpKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/box_coder_op.h
+++ b/src/operators/box_coder_op.h
@@ -39,10 +39,6 @@ class BoxCoderOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, BoxCoderParam<DeviceType>,
                                      operators::BoxCoderKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, BoxCoderParam<DeviceType>,
-      operators::BoxCoderKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/concat_op.h
+++ b/src/operators/concat_op.h
@@ -34,10 +34,6 @@ class ConcatOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ConcatParam<DeviceType>,
                                      operators::ConcatKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ConcatParam<DeviceType>,
-      operators::ConcatKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -62,4 +62,8 @@ REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
 REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(conv2d, ops::ConvOp);
+#endif
 #endif
--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
@@ -34,10 +34,6 @@ class ConvOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
                                      operators::ConvKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ConvParam<DeviceType>,
-      operators::ConvKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 private:

--- a/src/operators/crf_op.h
+++ b/src/operators/crf_op.h
@@ -37,10 +37,6 @@ class CrfOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, CrfParam<DeviceType>,
                                      operators::CrfKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, CrfParam<DeviceType>,
-      operators::CrfKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/depthwise_conv_op.h
+++ b/src/operators/depthwise_conv_op.h
@@ -36,10 +36,6 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
            DeviceType, ConvParam<DeviceType>,
            operators::DepthwiseConvKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ConvParam<DeviceType>,
-      operators::DepthwiseConvKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 private:

--- a/src/operators/dropout_op.h
+++ b/src/operators/dropout_op.h
@@ -38,10 +38,6 @@ class DropoutOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
                                      operators::DropoutKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  // using framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
-  //                                    operators::DropoutKernel<DeviceType,
-  //                                    T>>;
  void InferShape() const override;
 protected:

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 #ifdef ELEMENTWISEADD_OP
-#include "elementwise_add_op.h"
+#include "operators/elementwise_add_op.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,4 +36,8 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
 REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
+#endif
 #endif
--- a/src/operators/elementwise_add_op.h
+++ b/src/operators/elementwise_add_op.h
@@ -37,10 +37,6 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
            DeviceType, ElementwiseAddParam<DeviceType>,
            operators::ElementwiseAddKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ElementwiseAddParam<DeviceType>,
-      operators::ElementwiseAddKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -14,6 +14,19 @@ limitations under the License. */
 #include "operators/feed_op.h"
+namespace paddle_mobile {
+namespace operators {
+/*!
+ * Output shape of feed: the output's current dims with dimension 0 replaced
+ * by the batch size taken from the op's parameters.
+ */
+template <typename DeviceType, typename T>
+void FeedOp<DeviceType, T>::InferShape() const {
+  auto output = this->param_.Out();
+  auto dims = output->dims();
+  dims[0] = this->param_.BatchSize();
+  output->Resize(dims);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
@@ -25,3 +38,6 @@ REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(feed, ops::FeedOp);
+#endif
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -16,68 +16,29 @@ limitations under the License. */
 #include <string>
 #include "framework/operator.h"
+#include "operators/kernel/feed_kernel.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+using std::string;
 template <typename DeviceType, typename T>
-class FeedOp : public framework::OperatorBase<DeviceType> {
+class FeedOp
+    : public framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
+                                           FeedKernel<DeviceType, T>> {
 public:
  FeedOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap attrs,
         std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
-                                            scope),
-        param_(inputs, outputs, attrs, scope.get()) {}
-  void InferShape() const {
-    auto out_dims = param_.Out()->dims();
-    out_dims[0] = param_.BatchSize();
-    param_.Out()->Resize(out_dims);
-  }
-#ifdef PADDLE_MOBILE_FPGA
-  void Init() {
-    Tensor *output = param_.Out();
-    fpga::format_fp16_ofm(output);
-  }
-  void RunImpl() const {
-    auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX());  // NOLINT
-    fpga::format_image(input);
-    auto input_ptr = input->data<float>();
-    Tensor *output = param_.Out();
-    auto output_ptr = output->data<float>();
-    fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
-    args.input_data_type = fpga::DATA_TYPE_FP32;
-    args.output_data_type = fpga::DATA_TYPE_FP16;
-    args.input_layout_type = fpga::LAYOUT_CHW;
-    args.output_layout_type = fpga::LAYOUT_HWC;
-    args.image.address = (void *)input_ptr;  // NOLINT
-    args.image.channels = (uint32_t)input->dims()[1];
-    args.image.height = (uint32_t)input->dims()[2];
-    args.image.width = (uint32_t)input->dims()[3];
-    args.image.pad_height = 0;
-    args.image.pad_width = 0;
-    args.output.address = output_ptr;
-    args.output.scale_address = output->scale;
-    fpga::PerformBypass(args);
-  }
-#else
+      : framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
-  void Init() {}
+                                      FeedKernel<DeviceType, T>>(
-  void RunImpl() const {
+            type, inputs, outputs, attrs, scope) {}
-    param_.Out()->ShareDataWith(*param_.InputX());
+  void InferShape() const override;
-    param_.Out()->set_lod(param_.InputX()->lod());
-  }
-#endif
 protected:
-  FeedParam<DeviceType> param_;
 };
 }  // namespace operators

--- a/src/operators/fetch_op.cpp
+++ b/src/operators/fetch_op.cpp
@@ -13,6 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/fetch_op.h"
+namespace paddle_mobile {
+namespace operators {
+/*!
+ * Output shape of fetch: the output is resized to the dims of input X.
+ */
+template <typename DeviceType, typename T>
+void FetchOp<DeviceType, T>::InferShape() const {
+  // Copy the dims first so Resize never reads from the tensor it is updating.
+  const auto input_dims = this->param_.InputX()->dims();
+  auto output = this->param_.Out();
+  output->Resize(input_dims);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
@@ -24,3 +35,6 @@ REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fetch, ops::FetchOp);
+#endif
--- a/src/operators/fetch_op.h
+++ b/src/operators/fetch_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include "framework/operator.h"
+#include "operators/kernel/fetch_kernel.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
@@ -23,25 +24,20 @@ namespace operators {
 using std::string;
 template <typename DeviceType, typename T>
-class FetchOp : public framework::OperatorBase<DeviceType> {
+class FetchOp
+    : public framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
+                                           FetchKernel<DeviceType, T>> {
 public:
  FetchOp(const string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const framework::AttributeMap attrs,
          std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
+      : framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
-                                            scope),
+                                      FetchKernel<DeviceType, T>>(
-        param_(inputs, outputs, attrs, *scope) {}
+            type, inputs, outputs, attrs, scope) {}
-  void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
-  void Init() {}
+  void InferShape() const override;
-  void InferShape() const {
-    auto x_dims = param_.InputX()->dims();
-    param_.Out()->Resize(x_dims);
-  }
 protected:
-  FetchParam<DeviceType> param_;
 };
 }  // namespace operators

--- a/src/operators/fill_constant_op.cpp
+++ b/src/operators/fill_constant_op.cpp
@@ -20,9 +20,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp);
 #endif

--- a/src/operators/fill_constant_op.h
+++ b/src/operators/fill_constant_op.h
@@ -37,7 +37,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                            scope),
        param_(inputs, outputs, attrs, *scope) {}
-  void RunImpl() const {
+  void RunImpl() {
    auto data_type =
        static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(
            param_.DataDtype());

--- a/src/operators/flatten_op.h
+++ b/src/operators/flatten_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include <vector>
 #include "framework/operator.h"
 #include "operators/kernel/flatten_kernel.h"
@@ -53,10 +54,6 @@ class FlattenOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, FlattenParam<DeviceType>,
                                      operators::FlattenKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FlattenParam<DeviceType>,
-      operators::FlattenKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/fusion_conv_add_add_prelu_op.h
+++ b/src/operators/fusion_conv_add_add_prelu_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
+#include <utility>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
 #include "operators/kernel/conv_add_add_prelu_kernel.h"
@@ -67,10 +68,6 @@ class FusionConvAddAddPReluOp
            DeviceType, FusionConvAddAddPReluParam<DeviceType>,
            operators::ConvAddAddPReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-      operators::ConvAddAddPReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
@@ -58,5 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
+#endif
 #endif
--- a/src/operators/fusion_conv_add_bn_relu_op.h
+++ b/src/operators/fusion_conv_add_bn_relu_op.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "op_param.h"
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
@@ -66,10 +66,6 @@ class FusionConvAddBNReluOp
            DeviceType, FusionConvAddBNReluParam<DeviceType>,
            operators::ConvAddBNReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvAddBNReluParam<DeviceType>,
-      operators::ConvAddBNReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_conv_add_op.cpp
+++ b/src/operators/fusion_conv_add_op.cpp
@@ -58,4 +58,8 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
 REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
+#endif
 #endif
--- a/src/operators/fusion_conv_add_op.h
+++ b/src/operators/fusion_conv_add_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "op_param.h"
+#include "operators/op_param.h"
 #include "operators/kernel/conv_add_kernel.h"
 namespace paddle_mobile {
@@ -56,10 +56,6 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
                                      FusionConvAddParam<DeviceType>,
                                      operators::ConvAddKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvAddParam<DeviceType>,
-      operators::ConvAddKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_conv_add_prelu_op.h
+++ b/src/operators/fusion_conv_add_prelu_op.h
@@ -40,9 +40,7 @@ class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}
                 },
                 removed_nodes);
  }
  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; }
@@ -63,9 +61,6 @@ class FusionConvAddPReluOp
            operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                          attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvAddPReluParam<DeviceType>,
-      operators::ConvAddPReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp);
+#endif
 #endif
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -29,9 +29,8 @@ namespace operators {
 class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvAddReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
-        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }
  void FolderNodes(
@@ -57,9 +56,6 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
            operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                         attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvAddReluParam<DeviceType>,
-      operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_conv_bn_add_relu_op.h
+++ b/src/operators/fusion_conv_bn_add_relu_op.h
@@ -18,9 +18,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
+#include <utility>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "op_param.h"
+#include "operators/op_param.h"
 #include "operators/kernel/conv_bn_add_relu_kernel.h"
 namespace paddle_mobile {
@@ -71,10 +72,6 @@ class FusionConvBNAddReluOp
            DeviceType, FusionConvBNAddReluParam<DeviceType>,
            operators::ConvBNAddReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvBNAddReluParam<DeviceType>,
-      operators::ConvBNAddReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_conv_bn_relu_op.h
+++ b/src/operators/fusion_conv_bn_relu_op.h
@@ -63,10 +63,6 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
            DeviceType, FusionConvBNReluParam<DeviceType>,
            operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                        attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionConvBNReluParam<DeviceType>,
-      operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_dwconv_bn_relu_op.h
+++ b/src/operators/fusion_dwconv_bn_relu_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "op_param.h"
+#include "operators/op_param.h"
 #include "operators/kernel/dwconv_bn_relu_kernel.h"
 namespace paddle_mobile {
@@ -65,9 +65,6 @@ class FusionDWConvBNReluOp
            operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                          attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionDWConvBNReluParam<DeviceType>,
-      operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -56,10 +56,6 @@ class FusionFcOp : public framework::OperatorWithKernel<
                                      operators::FusionFcKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionFcParam<DeviceType>,
-      operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/fusion_fc_relu_op.h
+++ b/src/operators/fusion_fc_relu_op.h
@@ -56,9 +56,6 @@ class FusionFcReluOp : public framework::OperatorWithKernel<
            operators::FusionFcReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                          attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, FusionFcReluParam<DeviceType>,
-      operators::FusionFcReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/gru_op.h
+++ b/src/operators/gru_op.h
@@ -37,10 +37,6 @@ class GruOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, GruParam<DeviceType>,
                                      operators::GruKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, GruParam<DeviceType>,
-      operators::GruKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/im2sequence_op.h
+++ b/src/operators/im2sequence_op.h
@@ -16,15 +16,14 @@ limitations under the License. */
 #pragma once
-#include <operators/op_param.h>
+#include <string>
+#include "operators/op_param.h"
 #include "framework/operator.h"
 #include "operators/kernel/im2sequence_kernel.h"
 namespace paddle_mobile {
 namespace operators {
-using namespace framework;
 template <typename DeviceType, typename T>
 class Im2SequenceOp : public framework::OperatorWithKernel<
                          DeviceType, Im2SequenceParam<DeviceType>,
@@ -39,9 +38,6 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
            operators::Im2SequenceKernel<DeviceType, T>>(type, inputs, outputs,
                                                         attrs, scope) {}
-  // using framework::OperatorWithKernel<
-  //    DeviceType, Im2SequenceParam<DeviceType>,
-  //    operators::Im2SequenceKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 private:

--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -26,8 +26,7 @@ bool BatchNormKernel<CPU, float>::Init(BatchNormParam<CPU> *param) {
 }
 template <>
-void BatchNormKernel<CPU, float>::Compute(
+void BatchNormKernel<CPU, float>::Compute(const BatchNormParam<CPU> &param) {
-    const BatchNormParam<CPU> &param) const {
  BatchnormCompute<float>(param);
 }

--- a/src/operators/kernel/arm/bilinear_interp_kernel.cpp
+++ b/src/operators/kernel/arm/bilinear_interp_kernel.cpp
@@ -27,7 +27,7 @@ bool BilinearInterpKernel<CPU, float>::Init(BilinearInterpParam<CPU> *param) {
 template <>
 void BilinearInterpKernel<CPU, float>::Compute(
-    const BilinearInterpParam<CPU> &param) const {
+    const BilinearInterpParam<CPU> &param) {
  BilinearInterpCompute<float>(param);
 }

--- a/src/operators/kernel/arm/box_coder_kernel.cpp
+++ b/src/operators/kernel/arm/box_coder_kernel.cpp
@@ -26,8 +26,7 @@ bool BoxCoderKernel<CPU, float>::Init(BoxCoderParam<CPU> *param) {
 }
 template <>
-void BoxCoderKernel<CPU, float>::Compute(
+void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam<CPU> &param) {
-    const BoxCoderParam<CPU> &param) const {
  BoxCoderCompute<float>(param);
 }

--- a/src/operators/kernel/arm/concat_kernel.cpp
+++ b/src/operators/kernel/arm/concat_kernel.cpp
@@ -26,7 +26,7 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam<CPU> *param) {
 }
 template <>
-void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) const {
+void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) {
  ConcatCompute<float>(param);
  param.Out()->set_lod(param.Inputs()[0]->lod());
 }

--- a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
@@ -28,7 +28,7 @@ bool ConvAddAddPReluKernel<CPU, float>::Init(
 template <>
 void ConvAddAddPReluKernel<CPU, float>::Compute(
-    const FusionConvAddAddPReluParam<CPU> &param) const {
+    const FusionConvAddAddPReluParam<CPU> &param) {
  ConvAddAddPReluCompute<float>(param);
 }
 template class ConvAddAddPReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -55,7 +55,7 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
 template <>
 void ConvAddBNReluKernel<CPU, float>::Compute(
-    const FusionConvAddBNReluParam<CPU> &param) const {
+    const FusionConvAddBNReluParam<CPU> &param) {
  ConvAddBNReluCompute<float>(param);
 }
 template class ConvAddBNReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -25,8 +25,7 @@ bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam<CPU> *param) {
 }
 template <>
-void ConvAddKernel<CPU, float>::Compute(
+void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
-    const FusionConvAddParam<CPU> &param) const {
  ConvAddCompute<float>(param);
 }

--- a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp
@@ -27,7 +27,7 @@ bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
 template <>
 void ConvAddPReluKernel<CPU, float>::Compute(
-    const FusionConvAddPReluParam<CPU> &param) const {
+    const FusionConvAddPReluParam<CPU> &param) {
  ConvAddPReluCompute<float>(param);
 }
 template class ConvAddPReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
@@ -27,7 +27,7 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
 template <>
 void ConvAddReluKernel<CPU, float>::Compute(
-    const FusionConvAddReluParam<CPU> &param) const {
+    const FusionConvAddReluParam<CPU> &param) {
  ConvAddReluCompute<float>(param);
 }
 template class ConvAddReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp
@@ -55,7 +55,7 @@ bool ConvBNAddReluKernel<CPU, float>::Init(
 template <>
 void ConvBNAddReluKernel<CPU, float>::Compute(
-    const FusionConvBNAddReluParam<CPU> &param) const {
+    const FusionConvBNAddReluParam<CPU> &param) {
  ConvBNAddReluCompute<float>(param);
 }
 template class ConvBNAddReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
@@ -57,7 +57,7 @@ bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam<CPU> *param) {
 template <>
 void ConvBNReluKernel<CPU, float>::Compute(
-    const FusionConvBNReluParam<CPU> &param) const {
+    const FusionConvBNReluParam<CPU> &param) {
  ConvBNReluCompute<float>(param);
 }
 template class ConvBNReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -26,7 +26,7 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
 }
 template <>
-void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) const {
+void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
  ConvCompute<float>(param);
 }

--- a/src/operators/kernel/arm/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp
@@ -27,7 +27,7 @@ bool ConvTransposeKernel<CPU, float>::Init(ConvTransposeParam<CPU> *param) {
 template <>
 void ConvTransposeKernel<CPU, float>::Compute(
-    const ConvTransposeParam<CPU> &param) const {
+    const ConvTransposeParam<CPU> &param) {
  ConvTransposeCompute<float>(param);
 }

--- a/src/operators/kernel/arm/crf_kernel.cpp
+++ b/src/operators/kernel/arm/crf_kernel.cpp
@@ -27,7 +27,7 @@ bool CrfKernel<CPU, float>::Init(CrfParam<CPU> *param) {
 }
 template <>
-void CrfKernel<CPU, float>::Compute(const CrfParam<CPU> &param) const {
+void CrfKernel<CPU, float>::Compute(const CrfParam<CPU> &param) {
  CrfCompute<float>(param);
 }

--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -26,8 +26,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
 }
 template <>
-void DepthwiseConvKernel<CPU, float>::Compute(
+void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
-    const ConvParam<CPU> &param) const {
  DepthwiseConvCompute<float>(param);
 }

--- a/src/operators/kernel/arm/dequantize_kernel.cpp
+++ b/src/operators/kernel/arm/dequantize_kernel.cpp
@@ -29,8 +29,7 @@ bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) {
 }
 template <>
-void DequantizeKernel<CPU, float>::Compute(
+void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) {
-    const DequantizeParam<CPU> &param) const {
  const Tensor *input = param.input_;
  Tensor *output = param.out_;
  float activation_scale = param.activation_scale_->data<float>()[0];

--- a/src/operators/kernel/arm/dropout_kernel.cpp
+++ b/src/operators/kernel/arm/dropout_kernel.cpp
@@ -27,7 +27,7 @@ bool DropoutKernel<CPU, float>::Init(DropoutParam<CPU> *para) {
 template <typename T>
 struct DropoutFunctor {
-  DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
+  explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
  inline T operator()(T in) const { return (1 - dropout_pro_) * in; }
 private:
@@ -35,7 +35,7 @@ struct DropoutFunctor {
 };
 template <>
-void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) const {
+void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) {
  const auto *input_x = param.InputX();
  auto *input_x_ptr = input_x->data<float>();
  auto *out = param.Out();

--- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
@@ -54,7 +54,7 @@ bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam<CPU> *param) {
 template <>
 void DWConvBNReluKernel<CPU, float>::Compute(
-    const FusionDWConvBNReluParam<CPU> &param) const {
+    const FusionDWConvBNReluParam<CPU> &param) {
  DWConvBNReluCompute<float>(param);
 }
 template class DWConvBNReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp
@@ -27,7 +27,7 @@ bool ElementwiseAddKernel<CPU, float>::Init(ElementwiseAddParam<CPU> *param) {
 template <>
 void ElementwiseAddKernel<CPU, float>::Compute(
-    const ElementwiseAddParam<CPU> &param) const {
+    const ElementwiseAddParam<CPU> &param) {
  ElementwiseAddCompute<float>(param);
  param.Out()->set_lod(param.InputX()->lod());
 }

--- a/src/operators/kernel/arm/elementwise_mul_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_mul_kernel.cpp
@@ -27,7 +27,7 @@ bool ElementwiseMulKernel<CPU, float>::Init(ElementwiseMulParam<CPU> *param) {
 template <>
 void ElementwiseMulKernel<CPU, float>::Compute(
-    const ElementwiseMulParam<CPU> &param) const {
+    const ElementwiseMulParam<CPU> &param) {
  ElementwiseMulCompute<float>(param);
  param.Out()->set_lod(param.InputX()->lod());
 }

--- a/src/operators/kernel/arm/elementwise_sub_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_sub_kernel.cpp
@@ -27,7 +27,7 @@ bool ElementwiseSubKernel<CPU, float>::Init(ElementwiseSubParam<CPU> *param) {
 template <>
 void ElementwiseSubKernel<CPU, float>::Compute(
-    const ElementwiseSubParam<CPU> &param) const {
+    const ElementwiseSubParam<CPU> &param) {
  ElementwiseSubCompute<float>(param);
  param.Out()->set_lod(param.InputX()->lod());
 }

--- a/src/operators/kernel/arm/feed_kernel.cpp
+++ b/src/operators/kernel/arm/feed_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/feed_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// CPU/float feed kernel: there is nothing to set up, so Init always
+// reports success.
+template <>
+bool FeedKernel<CPU, float>::Init(FeedParam<CPU> *param) {
+  return true;
+}
+// Makes the output tensor share the input tensor's data and then copies the
+// input's LoD onto the output.
+template <>
+void FeedKernel<CPU, float>::Compute(const FeedParam<CPU> &param) {
+  const auto *input = param.InputX();
+  auto *output = param.Out();
+  output->ShareDataWith(*input);
+  output->set_lod(input->lod());
+}
+template class FeedKernel<CPU, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/arm/fetch_kernel.cpp
+++ b/src/operators/kernel/arm/fetch_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+// NOTE: this file is intentionally not wrapped in FUSION_CONVADD_OP. The
+// fetch kernel is unrelated to the conv+add fusion op, and its CPU
+// specialization has to be compiled regardless of which fusion ops are
+// enabled.
+#include "operators/kernel/fetch_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// CPU/float fetch kernel: there is nothing to set up, so Init always
+// reports success.
+template <>
+bool FetchKernel<CPU, float>::Init(FetchParam<CPU> *param) {
+  return true;
+}
+// Makes the output tensor share the data of the input tensor.
+template <>
+void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+}
+template class FetchKernel<CPU, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/arm/flatten_kernel.cpp
+++ b/src/operators/kernel/arm/flatten_kernel.cpp
@@ -26,7 +26,7 @@ bool FlattenKernel<CPU, float>::Init(FlattenParam<CPU> *param) {
 }
 template <>
-void FlattenKernel<CPU, float>::Compute(const FlattenParam<CPU> &param) const {
+void FlattenKernel<CPU, float>::Compute(const FlattenParam<CPU> &param) {
  FlattenCompute<float>(param);
 }

--- a/src/operators/kernel/arm/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp
@@ -26,8 +26,7 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
 }
 template <>
-void FusionFcKernel<CPU, float>::Compute(
+void FusionFcKernel<CPU, float>::Compute(const FusionFcParam<CPU> &param) {
-    const FusionFcParam<CPU> &param) const {
  FusionFcCompute<float>(param);
  param.Out()->set_lod(param.InputX()->lod());
 }

--- a/src/operators/kernel/arm/gru_kernel.cpp
+++ b/src/operators/kernel/arm/gru_kernel.cpp
@@ -26,7 +26,7 @@ bool GruKernel<CPU, float>::Init(GruParam<CPU> *param) {
 }
 template <>
-void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) const {
+void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) {
  GruCompute<float>(param);
  param.OutHidden()->set_lod(param.InputInput()->lod());
  //  DLOG << "________________" << param.OutHidden()->dims();

--- a/src/operators/kernel/arm/im2sequence_kernel.cpp
+++ b/src/operators/kernel/arm/im2sequence_kernel.cpp
@@ -33,7 +33,7 @@ inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0,
 template <>
 void Im2SequenceKernel<CPU, float>::Compute(
-    const Im2SequenceParam<CPU> &param) const {
+    const Im2SequenceParam<CPU> &param) {
  const Tensor *in_x = param.Input();
  framework::LoDTensor *out = param.Output();
  out->mutable_data<float>();
@@ -56,7 +56,7 @@ void Im2SequenceKernel<CPU, float>::Compute(
  out->mutable_data<float>({batch_size * output_height * output_width,
                            img_channels * kernels[0] * kernels[1]});
  const std::vector<int> dilations({1, 1});
-  // TODO: verify
+  // TODO(): verify
  auto out_dims = out->dims();
  out->Resize({batch_size, out->numel() / batch_size});
  for (int i = 0; i < batch_size; i++) {

--- a/src/operators/kernel/arm/lookup_kernel.cpp
+++ b/src/operators/kernel/arm/lookup_kernel.cpp
@@ -25,7 +25,7 @@ bool LookupKernel<CPU, float>::Init(LookupParam<CPU> *param) {
 }
 template <>
-void LookupKernel<CPU, float>::Compute(const LookupParam<CPU> &param) const {
+void LookupKernel<CPU, float>::Compute(const LookupParam<CPU> &param) {
  LookupCompute<float>(param);
  param.Out()->set_lod(param.InputIds()->lod());
 }

--- a/src/operators/kernel/arm/lrn_kernel.cpp
+++ b/src/operators/kernel/arm/lrn_kernel.cpp
@@ -26,7 +26,7 @@ bool LrnKernel<CPU, float>::Init(LrnParam<CPU> *param) {
 }
 template <>
-void LrnKernel<CPU, float>::Compute(const LrnParam<CPU> &param) const {
+void LrnKernel<CPU, float>::Compute(const LrnParam<CPU> &param) {
  LrnCompute<float>(param);
 }

--- a/src/operators/kernel/arm/mul_kernel.cpp
+++ b/src/operators/kernel/arm/mul_kernel.cpp
@@ -26,7 +26,7 @@ bool MulKernel<CPU, float>::Init(MulParam<CPU> *param) {
 }
 template <>
-void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
+void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) {
  MulCompute<float>(param);
  param.Out()->set_lod(param.InputX()->lod());
 }

--- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp
+++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp
@@ -27,7 +27,7 @@ bool MultiClassNMSKernel<CPU, float>::Init(MultiClassNMSParam<CPU> *param) {
 template <>
 void MultiClassNMSKernel<CPU, float>::Compute(
-    const MultiClassNMSParam<CPU> &param) const {
+    const MultiClassNMSParam<CPU> &param) {
  MultiClassNMSCompute<float>(param);
 }

--- a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp
+++ b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp
@@ -28,7 +28,7 @@ bool PolygonBoxTransformKernel<CPU, float>::Init(
 template <>
 void PolygonBoxTransformKernel<CPU, float>::Compute(
-    const PolygonBoxTransformParam<CPU> &param) const {
+    const PolygonBoxTransformParam<CPU> &param) {
  PolygonBoxTransformCompute<float>(param);
 }

--- a/src/operators/kernel/arm/pool_kernel.cpp
+++ b/src/operators/kernel/arm/pool_kernel.cpp
@@ -25,7 +25,7 @@ bool PoolKernel<CPU, float>::Init(PoolParam<CPU> *param) {
 }
 template <>
-void PoolKernel<CPU, float>::Compute(const PoolParam<CPU> &param) const {
+void PoolKernel<CPU, float>::Compute(const PoolParam<CPU> &param) {
  PoolCompute<float>(param);
 }
 }  // namespace operators

--- a/src/operators/kernel/arm/prelu_kernel.cpp
+++ b/src/operators/kernel/arm/prelu_kernel.cpp
@@ -35,7 +35,7 @@ struct PReluFunctor {
 * @b 特化到具体平台的实现, param 从 op 层传入
 * */
 template <>
-void PReluKernel<CPU, float>::Compute(const PReluParam<CPU> &param) const {
+void PReluKernel<CPU, float>::Compute(const PReluParam<CPU> &param) {
  auto *x = param.InputX();
  auto *alpha = param.InputAlpha();
  auto *out = param.Out();

--- a/src/operators/kernel/arm/prior_box_kernel.cpp
+++ b/src/operators/kernel/arm/prior_box_kernel.cpp
@@ -26,8 +26,7 @@ bool PriorBoxKernel<CPU, float>::Init(PriorBoxParam<CPU> *param) {
 }
 template <>
-void PriorBoxKernel<CPU, float>::Compute(
+void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam<CPU> &param) {
-    const PriorBoxParam<CPU> &param) const {
  PriorBoxCompute<float>(param);
 }

--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -279,8 +279,7 @@ bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
 }
 template <>
-void QuantizeKernel<CPU, float>::Compute(
+void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
-    const QuantizeParam<CPU> &param) const {
  float max_abs = 0.f;
  const Tensor *input = param.input_;
  Tensor *output = param.out_;

--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -26,7 +26,7 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
 }
 template <>
-void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) const {
+void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
  ReluCompute<float>(param);
 }

--- a/src/operators/kernel/arm/reshape2_kernel.cpp
+++ b/src/operators/kernel/arm/reshape2_kernel.cpp
@@ -26,8 +26,7 @@ bool Reshape2Kernel<CPU, float>::Init(Reshape2Param<CPU> *param) {
 }
 template <>
-void Reshape2Kernel<CPU, float>::Compute(
+void Reshape2Kernel<CPU, float>::Compute(const Reshape2Param<CPU> &param) {
-    const Reshape2Param<CPU> &param) const {
  Reshape2Compute<float>(param);
 }

--- a/src/operators/kernel/arm/reshape_kernel.cpp
+++ b/src/operators/kernel/arm/reshape_kernel.cpp
@@ -26,7 +26,7 @@ bool ReshapeKernel<CPU, float>::Init(ReshapeParam<CPU> *param) {
 }
 template <>
-void ReshapeKernel<CPU, float>::Compute(const ReshapeParam<CPU> &param) const {
+void ReshapeKernel<CPU, float>::Compute(const ReshapeParam<CPU> &param) {
  ReshapeCompute<float>(param);
 }

--- a/src/operators/kernel/arm/resize_kernel.cpp
+++ b/src/operators/kernel/arm/resize_kernel.cpp
@@ -108,7 +108,7 @@ void ResizeTensor(const Tensor* src, Tensor* dst) {
 }
 template <>
-void ResizeKernel<CPU, float>::Compute(const ResizeParam<CPU>& param) const {
+void ResizeKernel<CPU, float>::Compute(const ResizeParam<CPU>& param) {
  const auto* input_x = param.InputX();
  const auto& input_x_dims = input_x->dims();
  auto* out = param.Out();

--- a/src/operators/kernel/arm/scale_kernel.cpp
+++ b/src/operators/kernel/arm/scale_kernel.cpp
@@ -23,7 +23,7 @@ namespace operators {
 * @b 特化到具体平台的实现, param 从 op 层传入
 * */
 template <>
-void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) const {
+void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) {
  const auto *input_x = param.InputX();
  auto *input_x_ptr = input_x->data<float>();
  auto *out = param.Out();

--- a/src/operators/kernel/arm/shape_kernel.cpp
+++ b/src/operators/kernel/arm/shape_kernel.cpp
@@ -26,7 +26,7 @@ bool ShapeKernel<CPU, float>::Init(ShapeParam<CPU> *param) {
 }
 template <>
-void ShapeKernel<CPU, float>::Compute(const ShapeParam<CPU> &param) const {
+void ShapeKernel<CPU, float>::Compute(const ShapeParam<CPU> &param) {
  ShapeCompute<float>(param);
 }

--- a/src/operators/kernel/arm/sigmoid_kernel.cpp
+++ b/src/operators/kernel/arm/sigmoid_kernel.cpp
@@ -32,7 +32,7 @@ bool SigmoidKernel<CPU, float>::Init(SigmoidParam<CPU> *param) {
 }
 template <>
-void SigmoidKernel<CPU, float>::Compute(const SigmoidParam<CPU> &param) const {
+void SigmoidKernel<CPU, float>::Compute(const SigmoidParam<CPU> &param) {
  SigmoidCompute<float>(param);
 }

--- a/src/operators/kernel/arm/softmax_kernel.cpp
+++ b/src/operators/kernel/arm/softmax_kernel.cpp
@@ -26,7 +26,7 @@ bool SoftmaxKernel<CPU, float>::Init(SoftmaxParam<CPU> *param) {
 }
 template <>
-void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam<CPU> &param) const {
+void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam<CPU> &param) {
  SoftmaxCompute<float>(param);
 }

--- a/src/operators/kernel/arm/split_kernel.cpp
+++ b/src/operators/kernel/arm/split_kernel.cpp
@@ -26,7 +26,7 @@ bool SplitKernel<CPU, float>::Init(SplitParam<CPU> *param) {
 }
 template <>
-void SplitKernel<CPU, float>::Compute(const SplitParam<CPU> &param) const {
+void SplitKernel<CPU, float>::Compute(const SplitParam<CPU> &param) {
  SplitCompute<float>(param);
 }

--- a/src/operators/kernel/arm/sum_kernel.cpp
+++ b/src/operators/kernel/arm/sum_kernel.cpp
@@ -26,7 +26,7 @@ bool SumKernel<CPU, float>::Init(SumParam<CPU> *param) {
 }
 template <>
-void SumKernel<CPU, float>::Compute(const SumParam<CPU> &param) const {
+void SumKernel<CPU, float>::Compute(const SumParam<CPU> &param) {
  SumCompute<float>(param);
  param.Out()->set_lod(param.Inputs()[0]->lod());
 }

--- a/src/operators/kernel/arm/transpose2_kernel.cpp
+++ b/src/operators/kernel/arm/transpose2_kernel.cpp
@@ -25,8 +25,7 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
 }
 template <>
-void Transpose2Kernel<CPU, float>::Compute(
+void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) {
-    const Transpose2Param<CPU> &param) const {
  Transpose2Compute<float>(param);
 }

--- a/src/operators/kernel/arm/transpose_kernel.cpp
+++ b/src/operators/kernel/arm/transpose_kernel.cpp
@@ -25,8 +25,7 @@ bool TransposeKernel<CPU, float>::Init(TransposeParam<CPU> *param) {
 }
 template <>
-void TransposeKernel<CPU, float>::Compute(
+void TransposeKernel<CPU, float>::Compute(const TransposeParam<CPU> &param) {
-    const TransposeParam<CPU> &param) const {
  TransposeCompute<float>(param);
 }

--- a/src/operators/kernel/batchnorm_kernel.h
+++ b/src/operators/kernel/batchnorm_kernel.h
@@ -22,13 +22,11 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-using namespace framework;
 template <typename DeviceType, typename T>
 class BatchNormKernel
    : public framework::OpKernelBase<DeviceType, BatchNormParam<DeviceType>> {
 public:
-  void Compute(const BatchNormParam<DeviceType> &param) const;
+  void Compute(const BatchNormParam<DeviceType> &param);
  bool Init(BatchNormParam<DeviceType> *param);
 };

--- a/src/operators/kernel/bilinear_interp_kernel.h
+++ b/src/operators/kernel/bilinear_interp_kernel.h
@@ -29,7 +29,7 @@ class BilinearInterpKernel
    : public framework::OpKernelBase<DeviceType,
                                     BilinearInterpParam<DeviceType>> {
 public:
-  void Compute(const BilinearInterpParam<DeviceType>& param) const;
+  void Compute(const BilinearInterpParam<DeviceType>& param);
  bool Init(BilinearInterpParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/box_coder_kernel.h
+++ b/src/operators/kernel/box_coder_kernel.h
@@ -29,7 +29,7 @@ template <typename DeviceType, typename T>
 class BoxCoderKernel
    : public framework::OpKernelBase<DeviceType, BoxCoderParam<DeviceType>> {
 public:
-  void Compute(const BoxCoderParam<DeviceType>& param) const;
+  void Compute(const BoxCoderParam<DeviceType>& param);
  bool Init(BoxCoderParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/cl/batchnorm_kernel.cpp
+++ b/src/operators/kernel/cl/batchnorm_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef BATCHNORM_OP
+#include "operators/kernel/batchnorm_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+/*
+ * Registers the "batchnorm" OpenCL kernel and folds mean, variance, scale,
+ * bias and epsilon into one multiplier and one offset per element:
+ *   new_scale = scale / sqrt(variance + epsilon)
+ *   new_bias  = bias - mean * new_scale
+ * Both results are wrapped in freshly allocated CLImage objects and stored on
+ * `param` through SetNewScale / SetNewBias. Always returns true.
+ */
+template <>
+bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
+  this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl");
+  const framework::CLImage *mean = param->InputMean();
+  const framework::CLImage *variance = param->InputVariance();
+  const framework::CLImage *scale = param->InputScale();
+  const framework::CLImage *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+  const int C = mean->numel();
+  float *new_scale_ptr = new float[C];
+  float *new_bias_ptr = new float[C];
+  for (int i = 0; i < C; i++) {
+    // A scalar replaces the former `float inv_std_ptr[C]` variable-length
+    // array, which is not standard C++ and put an unbounded buffer on the
+    // stack. The arithmetic is unchanged.
+    const float inv_std =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+    new_scale_ptr[i] = inv_std * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std * scale_ptr[i];
+  }
+  // NOTE(review): nothing in this function frees the two CLImage objects;
+  // presumably `param` takes ownership - confirm.
+  framework::CLImage *new_scale = new framework::CLImage();
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         this->cl_helper_.CLCommandQueue());
+  framework::CLImage *new_bias = new framework::CLImage();
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        this->cl_helper_.CLCommandQueue());
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  // The staging buffers are released here, so SetTensorData / InitCLImage are
+  // assumed to have copied the values - TODO confirm.
+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+  return true;
+}
+/*
+ * Binds the input, folded scale/bias and output images to the kernel
+ * registered in Init (slot 0 of the helper) and enqueues it over the default
+ * work size derived from the output tensor. The enqueue passes no wait list
+ * and no completion event.
+ */
+template <>
+void BatchNormKernel<GPU_CL, float>::Compute(
+    const BatchNormParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY());
+  auto input = param.InputX()->GetCLImage();
+  auto out = param.OutputY()->GetCLImage();
+  auto new_scale = param.NewScale()->GetCLImage();
+  auto new_bias = param.NewBias()->GetCLImage();
+  // The kernel multiplies its dim-0 id by this value to build the x
+  // coordinate, so it has to equal the dim-1 global size.
+  const int out_width = default_work_size[1];
+  // Kernel argument indices are zero-based and follow the parameter order of
+  // `batchnorm` in batchnorm_kernel.cl:
+  //   0 out_width, 1 input, 2 new_scale_image, 3 new_bias_image, 4 output.
+  // The previous 1..5 numbering left argument 0 unset and used an index that
+  // does not exist on a five-parameter kernel.
+  clSetKernelArg(kernel, 0, sizeof(int), &out_width);
+  clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias);
+  clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+}
+template class BatchNormKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * Folded batch normalisation: each work-item reads one texel (four half
+ * values) of `input`, computes in * new_scale + new_bias and writes the result
+ * to the same coordinate of `output`.
+ *
+ * out_width        multiplier applied to the dim-0 id when forming the image x
+ *                  coordinate (x = id0 * out_width + id1)
+ * input            source image
+ * new_scale_image  multiplier image, read at (id0, 0)
+ * new_bias_image   offset image, read at (id0, 0)
+ * output           destination image
+ *
+ * Global ids: dim 0 -> out_c, dim 1 -> out_w, dim 2 -> out_nh (image row).
+ */
+__kernel void batchnorm(__private const int out_width,
+                        __read_only image2d_t input,
+                        __read_only image2d_t new_scale_image,
+                        __read_only image2d_t new_bias_image,
+                        __write_only image2d_t output) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  // All reads below use integer texel coordinates; OpenCL C only defines the
+  // result of such reads for a sampler with unnormalised coordinates.
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const half4 new_scale =
+      read_imageh(new_scale_image, sampler, (int2)(out_c, 0));
+  const half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0));
+  const int2 pos = (int2)(mad24(out_c, out_width, out_w), out_nh);
+  const half4 in = read_imageh(input, sampler, pos);
+  write_imageh(output, pos, mad(in, new_scale, new_bias));
+}
--- a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * Adds a bias texel to every texel of `input` and stores the sum in
+ * `outputImage` at the same coordinate.
+ *
+ * input        source image, read at (id0, id1)
+ * bias         bias image, read at (id0 / w, 0)
+ * outputImage  destination image
+ * w            divisor that maps an image x coordinate to a bias column;
+ *              must be non-zero
+ *
+ * The image parameters are declared __read_only instead of __global: image
+ * arguments may not carry an address-space qualifier, and read-only is the
+ * access mode they had by default.
+ */
+__kernel void channel_add(__read_only image2d_t input,
+                          __read_only image2d_t bias,
+                          __write_only image2d_t outputImage,
+                          int w) {
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  // Integer texel coordinates require an unnormalised sampler; with
+  // CLK_NORMALIZED_COORDS_TRUE the values returned are undefined.
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int2 coords = (int2)(x, y);
+  const int2 coords_bias = (int2)(x / w, 0);
+  const half4 in = read_imageh(input, sampler, coords);
+  const half4 biase = read_imageh(bias, sampler, coords_bias);
+  write_imageh(outputImage, coords, in + biase);
+}
--- a/src/operators/kernel/cl/cl_kernel/cl_common.h
+++ b/src/operators/kernel/cl/cl_kernel/cl_common.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * Applies the activation selected at compile time to `in`.
+ *
+ *   PRELU defined: in where in >= 0, prelu_alpha * in elsewhere
+ *                  (the function then takes `prelu_alpha` as a second argument)
+ *   RELU defined:  max(in, 0); when both macros are defined this result wins
+ *                  because it is assigned last
+ *   neither:       `in` is returned unchanged (previously the return value was
+ *                  an uninitialised variable)
+ */
+inline half4 activation(half4 in
+#ifdef PRELU
+                        ,
+                        half4 prelu_alpha
+#endif
+) {
+  half4 output = in;
+#ifdef PRELU
+  output = select(prelu_alpha * in, in, in >= (half4)(0.0f));
+#endif
+#ifdef RELU
+  output = fmax(in, (half4)(0.0f));
+#endif
+  return output;
+}
--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define BIASE
+#define BATCH_NORM
+#define RELU
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define BIASE
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define BIASE
+#define RELU
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+/*
+conv
+conv_bn
+conv_add
+conv_relu
+conv_bn_relu
+conv_add_relu
+conv_add_bn_relu
+*/
+#include "cl_common.h"
+/*
+ * 3x3 convolution over image-packed data. Each work-item produces one output
+ * texel (four half values) and writes it to
+ * (out_c * global_size_dim1 + out_w, out_nh).
+ *
+ * global_size_dim0..2  valid extent of each global id; work-items outside it
+ *                      return without writing
+ * input_image          source image; block i of the input starts at
+ *                      x = i * input_width
+ * filter               weights; for input block i, tap j and output component
+ *                      k the texel is at
+ *                      (i * 3 + j % 3, out_c * 12 + k * 3 + j / 3)
+ * bias                 (BIASE only) initial accumulator, read at (out_c, 0)
+ * new_scale/new_biase  (BATCH_NORM only) applied as output * scale + bias,
+ *                      both read at (out_c, 0)
+ * output_image         destination image
+ * stride, offset       input position = output position * stride + offset
+ * input_c              number of input blocks iterated over
+ * dilation             distance in texels between neighbouring taps
+ * input_width/height   size of one input block; taps outside contribute zero
+ * output_width/height  not used by this kernel
+ *
+ * NOTE(review): the row id out_nh is used directly as the row inside one
+ * block (there is no per-batch split as in depth_conv_3x3) - confirm this
+ * kernel is only launched with a single batch.
+ */
+__kernel void conv_3x3(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,/* of one block */
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        return;
+    }
+    /* Integer texel coordinates are only defined with an unnormalised sampler. */
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP           |
+                              CLK_FILTER_NEAREST;
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = out_w * stride + offset;
+    in_pos_in_one_block.y = out_nh * stride + offset;
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+    for (int i = 0; i < input_c; ++i) {
+        const int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        /* Taps are visited row by row: j = 0 is top-left, j = 8 bottom-right. */
+        for (int j = 0; j < 9; ++j) {
+            const int dx = (j % 3 - 1) * dilation;
+            const int dy = (j / 3 - 1) * dilation;
+            const int tap_x = in_pos_in_one_block.x + dx;
+            const int tap_y = in_pos_in_one_block.y + dy;
+            const int outside = tap_x < 0 || tap_y < 0 ||
+                                tap_x >= input_width || tap_y >= input_height;
+            /* select() picks its second operand where the mask's top bit is
+               set, so padding taps become zero. */
+            const half4 tap = select(read_imageh(input_image, sampler,
+                                                 (int2)(pos_in.x + dx, pos_in.y + dy)),
+                                     (half4)(0.0f),
+                                     (ushort4)(outside << 15));
+            int2 pos_of_weight;
+            pos_of_weight.x = i * 3 + j % 3;
+            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+            output.x += dot(tap, weight_x);
+            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+            output.y += dot(tap, weight_y);
+            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+            output.z += dot(tap, weight_z);
+            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+            output.w += dot(tap, weight_w);
+        }
+    }
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+#ifdef RELU
+    output = activation(output);
+#endif
+    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
+}
+/*
+ * Depth-wise 3x3 convolution: each output texel is the element-wise
+ * multiply-accumulate of nine input texels with nine filter texels from the
+ * same block index (out_c). The result is written to
+ * (out_c * global_size_dim1 + out_w, out_nh).
+ *
+ * global_size_dim1     width used to form the output x coordinate
+ * input                source image; block out_c starts at
+ *                      x = out_c * input_width, batch b at y = b * input_height
+ * filter               3x3 weight texels starting at
+ *                      (out_c * 3, batch_index * 3)
+ * bias                 (BIASE only) initial accumulator, read at (out_c, 0)
+ * new_scale/new_biase  (BATCH_NORM only) applied as output * scale + bias
+ * output_image         destination image
+ * stride, offset       input position = output position * stride + offset
+ * input_width/height   size of one input block; taps outside contribute zero
+ * output_height        rows per batch; splits out_nh into batch and row
+ * global_size_dim0, global_size_dim2, input_c, dilation, output_width
+ *                      not used by this kernel (taps are always 1 texel apart)
+ *
+ * NOTE(review): the filter row origin depends on batch_index, which looks
+ * unintended for batch sizes above one - confirm against the filter layout.
+ */
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2,
+                             __read_only image2d_t input,
+                             __read_only image2d_t filter,
+#ifdef BIASE
+                             __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                             __read_only image2d_t new_scale,
+                             __read_only image2d_t new_biase,
+#endif
+                             __write_only image2d_t output_image,
+                             __private const int stride,
+                             __private const int offset,
+                             __private const int input_c,
+                             __private const int dilation,
+                             __private const int input_width,/* of one block */
+                             __private const int input_height, /* of one block */
+                             __private const int output_width,
+                             __private const int output_height) {
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+    const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    /* Integer texel coordinates are only defined with an unnormalised sampler. */
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                              CLK_ADDRESS_CLAMP           |
+                              CLK_FILTER_NEAREST;
+    const int batch_index = out_nh / output_height;
+    const int out_nh_in_one_batch = out_nh % output_height;
+    const int2 in_pos_in_one_block =
+        (int2)(out_w, out_nh_in_one_batch) * (int2)(stride, stride) + (int2)(offset, offset);
+#ifdef BIASE
+    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+    half4 output = 0.0f;
+#endif
+    const int filter_width = 3;
+    const int filter_height = 3;
+    const int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+    const int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
+    /* Taps are visited row by row: j = 0 is top-left, j = 8 bottom-right. */
+    for (int j = 0; j < 9; ++j) {
+        const int dx = j % 3 - 1;
+        const int dy = j / 3 - 1;
+        const int tap_x = in_pos_in_one_block.x + dx;
+        const int tap_y = in_pos_in_one_block.y + dy;
+        const int outside = tap_x < 0 || tap_y < 0 ||
+                            tap_x >= input_width || tap_y >= input_height;
+        /* select() picks its second operand where the mask's top bit is set,
+           so padding taps become zero. */
+        const half4 tap = select(read_imageh(input, sampler,
+                                             (int2)(pos_in_input_block.x + tap_x,
+                                                    pos_in_input_block.y + tap_y)),
+                                 (half4)(0.0f),
+                                 (ushort4)(outside << 15));
+        const half4 weight = read_imageh(filter, sampler,
+                                         (int2)(pos_in_filter_block.x + dx + 1,
+                                                pos_in_filter_block.y + dy + 1));
+        output += tap * weight;
+    }
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+#ifdef RELU
+    output = activation(output);
+#endif
+    write_imageh(output_image, output_pos, output);
+}
+/*
+ * 1x1 convolution over image-packed data. Each work-item accumulates, for
+ * every input block, the four components of the input texel multiplied by four
+ * weight texels, and writes the result to
+ * (out_c * global_size_dim1 + out_w, out_nh).
+ *
+ * global_size_dim1     width used to form the output x coordinate
+ * input_image          source image; block i starts at x = i * input_width
+ * filter               weights; for input block i and input component k the
+ *                      texel is at (out_c, i * 4 + k)
+ * bias                 (BIASE only) initial accumulator, read at (out_c, 0)
+ * new_scale/new_biase  (BATCH_NORM only) applied as output * scale + bias
+ * output_image         destination image
+ * stride, offset       input position = output position * stride + offset
+ * input_c              number of input blocks iterated over
+ * global_size_dim0, global_size_dim2, dilation, input_height, output_width,
+ * output_height        not used by this kernel
+ */
+__kernel void conv_1x1(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,/* of one block */
+                       __private const int input_height,/* of one block */
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  /* Integer texel coordinates are only defined with an unnormalised sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP           |
+                            CLK_FILTER_NEAREST;
+  const int2 in_pos_in_one_block =
+      (int2)(out_w, out_nh) * (int2)(stride, stride) + (int2)(offset, offset);
+#ifdef BIASE
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#else
+  half4 output = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    const int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    const half4 input = read_imageh(input_image, sampler, pos_in);
+    const half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    const half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    const half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    const half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+    /* Each input component scales one whole weight texel. */
+    output = mad(input.x, weight0, output);
+    output = mad(input.y, weight1, output);
+    output = mad(input.z, weight2, output);
+    output = mad(input.w, weight3, output);
+  }
+#ifdef BATCH_NORM
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+#ifdef RELU
+  output = activation(output);
+#endif
+  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos, output);
+}
+/*
+__kernel void conv_1x1_4(__private const int global_size_dim0,
+                       __private const int global_size_dim1,
+                       __private const int global_size_dim2,
+                       __read_only image2d_t input_image,
+                       __read_only image2d_t filter,
+#ifdef BIASE
+                       __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                       __read_only image2d_t new_scale,
+                       __read_only image2d_t new_biase,
+#endif
+                       __write_only image2d_t output_image,
+                       __private const int stride,
+                       __private const int offset,
+                       __private const int input_c,
+                       __private const int dilation,
+                       __private const int input_width,
+                       __private const int input_height,
+                       __private const int output_width,
+                       __private const int output_height) {
+  const int out_c = get_global_id(0) * 4;
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                           CLK_ADDRESS_CLAMP         |
+                           CLK_FILTER_NEAREST;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE
+    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0));
+    half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0));
+    half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0));
+#else
+    half4 output0 = 0.0f;
+    half4 output1 = 0.0f;
+    half4 output2 = 0.0f;
+    half4 output3 = 0.0f;
+#endif
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        half4 input = read_imageh(input_image, sampler, pos_in);
+        half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+        half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+        half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+        half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+        output0 = mad(input.x, weight0_0, output0);
+        output0 = mad(input.y, weight0_1, output0);
+        output0 = mad(input.z, weight0_2, output0);
+        output0 = mad(input.w, weight0_3, output0);
+        half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0));
+        half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1));
+        half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2));
+        half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3));
+        output1 = mad(input.x, weight1_0, output1);
+        output1 = mad(input.y, weight1_1, output1);
+        output1 = mad(input.z, weight1_2, output1);
+        output1 = mad(input.w, weight1_3, output1);
+        half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0));
+        half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1));
+        half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2));
+        half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3));
+        output2 = mad(input.x, weight2_0, output2);
+        output2 = mad(input.y, weight2_1, output2);
+        output2 = mad(input.z, weight2_2, output2);
+        output2 = mad(input.w, weight2_3, output2);
+        half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0));
+        half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1));
+        half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2));
+        half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3));
+        output3 = mad(input.x, weight3_0, output3);
+        output3 = mad(input.y, weight3_1, output3);
+        output3 = mad(input.z, weight3_2, output3);
+        output3 = mad(input.w, weight3_3, output3);
+   }
+#ifdef BATCH_NORM
+    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0));
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0));
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0));
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0));
+#endif
+#ifdef RELU
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
+#endif
+  int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos0, output0);
+  int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos1, output1);
+  int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos2, output2);
+  int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh);
+  write_imageh(output_image, output_pos3, output3);
+}
+*/
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+/* Feature switches defined before pulling in the shared convolution source.
+   NOTE(review): presumably these enable the `#ifdef BIASE`, `#ifdef BATCH_NORM`
+   and `#ifdef RELU` sections of the kernels in conv_kernel.inc.cl (bias add,
+   batch-norm scale/shift, activation) - confirm against that file. */
+#define BIASE
+#define BATCH_NORM
+#define RELU
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * elementwise_add: outputImage(x, y) = input(x, y) + bias(x, y).
+ *
+ * One work-item per texel; dim0 is the x coordinate and dim1 the y coordinate.
+ * Both operands are read at the same coordinate, so `bias` is indexed exactly
+ * like `input`.
+ */
+__kernel void elementwise_add(__read_only image2d_t input,
+                              __read_only image2d_t bias,
+                              __write_only image2d_t outputImage) {
+  /* Image arguments take an access qualifier, not an address-space
+     qualifier; both inputs are only read here. */
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int2 coords = (int2)(x, y);
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords);
+  half4 output = in + biase;
+  write_imageh(outputImage, coords, output);
+}
--- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * feed: packs a planar float buffer into a half4 image.
+ *
+ * Work-item (i, j) = (get_global_id(0), get_global_id(1)) reads element
+ * i * w + j from three consecutive planes of h * w floats each and writes
+ * them to lanes x, y, z of the texel at image coordinate (j, i). Lane w is
+ * set to zero.
+ * NOTE(review): exactly three planes are read, so `in` is assumed to hold at
+ * least 3 * h * w floats - confirm against the caller.
+ */
+__kernel void feed(__global const float *in,
+                   __write_only image2d_t outputImage,
+                   int h,
+                   int w) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  const int plane = h * w;          /* number of floats in one plane */
+  const int offset = i * w + j;     /* position inside a plane */
+  half4 pixel;
+  pixel.x = convert_half(in[offset]);
+  pixel.y = convert_half(in[plane + offset]);
+  pixel.z = convert_half(in[2 * plane + offset]);
+  /* Single-precision literal: a bare 0.0 is a double constant. */
+  pixel.w = 0.0f;
+  const int2 coords = (int2)(j, i);
+  write_imageh(outputImage, coords, pixel);
+}
--- a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * fetch: unpacks a half4 image into a float buffer.
+ *
+ * Work-item ids: dim0 = channel block (in_c), dim1 = column (in_w),
+ * dim2 = row index (in_nh), which is split into in_n = in_nh / in_height and
+ * in_h = in_nh % in_height.
+ *
+ * The texel at (in_c * in_width + in_w, in_nh) is read and its four lanes are
+ * stored at
+ *   index + k * size_ch,  k = 0..3,
+ * where index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w.
+ * NOTE(review): all four lanes are written unconditionally; `out` must be
+ * large enough for that even for the last channel block - confirm in caller.
+ */
+__kernel void fetch(__private const int in_height,
+                    __private const int in_width,
+                    __read_only image2d_t input,
+                    __global float* out,
+                    __private const int size_ch,
+                    __private const int size_block,
+                    __private const int size_batch) {
+  const int in_c = get_global_id(0);
+  const int in_w = get_global_id(1);
+  const int in_nh = get_global_id(2);
+  const int in_n = in_nh / in_height;
+  const int in_h = in_nh % in_height;
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int pos_x = mad24(in_c, in_width, in_w);
+  half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh));
+  const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
+  out[index] = convert_float(in.x);
+  out[index + size_ch] = convert_float(in.y);
+  out[index + size_ch * 2] = convert_float(in.z);
+  out[index + size_ch * 3] = convert_float(in.w);
+}
+/*
+ * fetch_2d: copies a half4 image into an interleaved float buffer.
+ *
+ * The column comes from get_global_id(1) and the row from get_global_id(2);
+ * get_global_id(0) is not used. The four lanes of texel (in_w, in_h) are
+ * written to out[(in_h * in_width + in_w) * 4 + k], k = 0..3.
+ * in_height is accepted but not read.
+ */
+__kernel void fetch_2d(__private const int in_height,
+                       __private const int in_width,
+                       __read_only image2d_t input,
+                       __global float* out) {
+  const int in_w = get_global_id(1);
+  const int in_h = get_global_id(2);
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, (int2)(in_w, in_h));
+  const int index = (in_h * in_width + in_w) * 4;
+  out[index] = convert_float(in.x);
+  out[index + 1] = convert_float(in.y);
+  out[index + 2] = convert_float(in.z);
+  out[index + 3] = convert_float(in.w);
+}
--- a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/* Seed for the running maximum; parenthesised so the macro expands safely
+   inside larger expressions. */
+#define MIN_VALUE (-FLT_MAX)
+/*
+ * pool_max: max pooling over half4 image tensors.
+ *
+ * Work-item ids: dim0 = channel block (out_c), dim1 = output column (out_w),
+ * dim2 = output row index (out_nh), split into out_n = out_nh / out_height
+ * and out_h = out_nh % out_height.
+ *
+ * The pooling window for an output element starts at
+ * (out_h * stride_h - pad_top, out_w * stride_w - pad_left), spans
+ * ksize_h x ksize_w input elements and is clipped to the input extent.
+ * Input texels are read at (out_c * in_width + x, out_n * in_height + y) and
+ * the lane-wise maximum is written to (out_c * out_width + out_w, out_nh).
+ */
+__kernel void pool_max(
+    __private const int in_height, __private const int in_width,
+    __private const int out_height, __private const int out_width,
+    __private const int pad_top, __private const int pad_left,
+    __private const int stride_h, __private const int stride_w,
+    __private const int ksize_h, __private const int ksize_w,
+    __read_only image2d_t input, __write_only image2d_t output) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const int out_n = out_nh / out_height;
+  const int out_h = out_nh % out_height;
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  /* Derive the window end from the UNCLAMPED start; clamping the start first
+     would enlarge the window at padded borders by the amount of padding. */
+  int start_h = out_h * stride_h - pad_top;
+  int end_h = min(start_h + ksize_h, in_height);
+  start_h = max(start_h, 0);
+  int start_w = out_w * stride_w - pad_left;
+  int end_w = min(start_w + ksize_w, in_width);
+  start_w = max(start_w, 0);
+  const int pos_in_x = out_c * in_width;
+  const int pos_in_y = out_n * in_height;
+  half4 max_value = (half4)(MIN_VALUE);
+  for (int y = start_h; y < end_h; ++y) {
+    for (int x = start_w; x < end_w; ++x) {
+      half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      max_value = max(max_value, tmp);
+    }
+  }
+  const int pos_out_x = mad24(out_c, out_width, out_w);
+  write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
+}
+/*
+ * pool_avg: average pooling over half4 image tensors.
+ *
+ * Work-item ids: dim0 = channel block (out_c), dim1 = output column (out_w),
+ * dim2 = output row index (out_nh), split into out_n = out_nh / out_height
+ * and out_h = out_nh % out_height.
+ *
+ * The pooling window for an output element starts at
+ * (out_h * stride_h - pad_top, out_w * stride_w - pad_left), spans
+ * ksize_h x ksize_w input elements and is clipped to the input extent.
+ * The lane-wise sum of the texels inside the clipped window is divided by the
+ * number of texels actually read (padding is not counted) and written to
+ * (out_c * out_width + out_w, out_nh).
+ */
+__kernel void pool_avg(
+    __private const int in_height, __private const int in_width,
+    __private const int out_height, __private const int out_width,
+    __private const int pad_top, __private const int pad_left,
+    __private const int stride_h, __private const int stride_w,
+    __private const int ksize_h, __private const int ksize_w,
+    __read_only image2d_t input, __write_only image2d_t output) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const int out_n = out_nh / out_height;
+  const int out_h = out_nh % out_height;
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  /* Derive the window end from the UNCLAMPED start; clamping the start first
+     would enlarge the window at padded borders by the amount of padding. */
+  int start_h = out_h * stride_h - pad_top;
+  int end_h = min(start_h + ksize_h, in_height);
+  start_h = max(start_h, 0);
+  int start_w = out_w * stride_w - pad_left;
+  int end_w = min(start_w + ksize_w, in_width);
+  start_w = max(start_w, 0);
+  const int pos_in_x = out_c * in_width;
+  const int pos_in_y = out_n * in_height;
+  half4 sum = (half4)(0.0f);
+  int num = 0;
+  for (int y = start_h; y < end_h; ++y) {
+    for (int x = start_w; x < end_w; ++x) {
+      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      num++;
+    }
+  }
+  /* An empty window leaves sum at zero; divide by 1 instead of 0 so the
+     result is 0 rather than NaN. */
+  const int divisor = max(num, 1);
+  half4 avg = sum / divisor;
+  const int pos_out_x = mad24(out_c, out_width, out_w);
+  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
+}
--- a/src/operators/kernel/cl/cl_kernel/relu.cl
+++ b/src/operators/kernel/cl/cl_kernel/relu.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * relu: output(x, y) = max(0, input(x, y)), applied to each half4 lane.
+ * One work-item per texel; dim0 is x and dim1 is y.
+ */
+__kernel void relu(__read_only image2d_t input,
+                   __write_only image2d_t output){
+  const int2 pos = (int2)(get_global_id(0), get_global_id(1));
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, pos);
+  in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in);
+  write_imageh(output, pos, in);
+}
+/*
+ * relu_p0: output(x, y) = max(0, input(x, y)), applied to each half4 lane.
+ * Performs the same computation as the `relu` kernel in this file.
+ * One work-item per texel; dim0 is x and dim1 is y.
+ */
+__kernel void relu_p0(__read_only image2d_t input,
+                   __write_only image2d_t output){
+  const int2 pos = (int2)(get_global_id(0), get_global_id(1));
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, pos);
+  in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in);
+  write_imageh(output, pos, in);
+}
+/*
+ * relu_p1: copies input(x, y) to output(x, y) unchanged; despite the name no
+ * clamping is applied here.
+ * One work-item per texel; dim0 is x and dim1 is y.
+ */
+__kernel void relu_p1(__read_only image2d_t input,
+                   __write_only image2d_t output){
+  const int2 pos = (int2)(get_global_id(0), get_global_id(1));
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, pos);
+  write_imageh(output, pos, in);
+}
--- a/src/operators/kernel/cl/cl_kernel/reshape.cl
+++ b/src/operators/kernel/cl/cl_kernel/reshape.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * reshape: currently a texel-for-texel copy, output(x, y) = input(x, y).
+ *
+ * One work-item per texel; dim0 is x and dim1 is y. The dimension arguments
+ * d0..d3 and x0..x3 are part of the kernel signature but are not read by this
+ * implementation.
+ */
+__kernel void reshape(__read_only image2d_t input,
+                      __write_only image2d_t output,
+                      __private const int d0,
+                      __private const int d1,
+                      __private const int d2,
+                      __private const int d3,
+                      __private const int x0,
+                      __private const int x1,
+                      __private const int x2,
+                      __private const int x3) {
+  const int2 pos = (int2)(get_global_id(0), get_global_id(1));
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, pos);
+  write_imageh(output, pos, in);
+}
+/*
+__kernel void reshape(__read_only image2d_t input,
+                      __write_only image2d_t output,
+                      __private const int d0,
+                      __private const int d1,
+                      __private const int d2,
+                      __private const int d3,
+                      __private const int x0,
+                      __private const int x1,
+                      __private const int x2,
+                      __private const int x3) {
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  int obx = x / x3;
+  int oby = y / x2;
+  int ox = x % x3;
+  int oy = y % x2;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 r;
+  for (int i = 0; i < 4; i++) {
+    int t = obx * 4 + i;
+    if (t > x1) break;
+    int oindex = oby * x1 * x2 * x3 + t * x2 * x3 + ox * x3 + oy;
+    int i3 = oindex % d3; oindex /= d3;
+    int i2 = oindex % d2; oindex /= d2;
+    int i1 = oindex % d1; oindex /= d1;
+    int i0 = oindex;
+    int ix = (i1 / 4) * d3 + i3;
+    int iy = i0 * d2 + i2;
+    half4 p = read_imageh(input, sampler, (int2)(ix, iy));
+    ((half*)&r)[i] = ((half*)&p)[i1%4];
+  }
+  write_imageh(output, (int2)(x, y), r);
+}
+*/
--- a/src/operators/kernel/cl/cl_kernel/softmax.cl
+++ b/src/operators/kernel/cl/cl_kernel/softmax.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+ * softmax: numerically stabilised softmax over the first image row.
+ *
+ * The reduction reads `group` texels at (i, 0), i in [0, group), and treats
+ * all four lanes of each texel as softmax inputs:
+ *   1. maxv = maximum over every lane read,
+ *   2. sum  = sum over every lane of exp(value - maxv),
+ *   3. the work-item's own texel (out_w, out_nh) is mapped to
+ *      exp(value - maxv) / sum and written back to the same coordinate.
+ * out_w comes from get_global_id(1) and out_nh from get_global_id(2);
+ * get_global_id(0) is not used.
+ * NOTE(review): every lane of every texel contributes to the sum, so any
+ * padding lanes in the last texel would be counted too - confirm how the
+ * input image is laid out.
+ */
+__kernel void softmax(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int group
+                      ) {
+  const int out_w = get_global_id(1);   // index in one block
+  const int out_nh = get_global_id(2);
+  /* Integer coordinates require an unnormalized-coordinate sampler. */
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  /* Seed the maximum from real data. Seeding with 0 leaves all-negative
+     inputs unshifted, so exp() underflows and the sum collapses to zero. */
+  half4 first = read_imageh(input_image, sampler, (int2)(0, 0));
+  half maxv = max(max(first.x, first.y), max(first.z, first.w));
+  for (int i = 1; i < group; ++i) {
+    half4 temp = read_imageh(input_image, sampler, (int2)(i, 0));
+    maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w))));
+  }
+  /* Accumulate in float: small exp() terms are lost when summed in half. */
+  float4 rsum = (float4)(0.0f);
+  for (int i = 0; i < group; ++i) {
+    half4 r = read_imageh(input_image, sampler, (int2)(i, 0));
+    rsum += exp(convert_float4(r - maxv));
+  }
+  float sum = rsum.x + rsum.y + rsum.z + rsum.w;
+  half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh));
+  half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum);
+  write_imageh(output_image, (int2)(out_w, out_nh), result);
+}
+/*
+__kernel void softmax(__read_only image2d_t input,
+                      __write_only image2d_t output,
+                      __private const int d0,
+                      __private const int d1,
+                      __private const int d2,
+                      __private const int d3) {
+  const int z = get_global_id(0);
+  const int x = get_global_id(1);
+  const int y = get_global_id(2);
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  half4 cv = read_imageh(input, sampler, (int2)(x, y));
+  half4 maxv = cv;
+  for (int i = 0; i < d3; i++) {
+    half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y));
+    maxv = max(maxv, temp);
+  }
+  half4 sum = (half4)0.0f;
+  // half4 x = = (half4)0.0f;
+  for (int i = 0; i < d3; i++) {
+    half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y));
+    sum += exp(temp - maxv);
+  }
+  half4 r = exp(cv - maxv) / sum;
+  write_imageh(output, (int2)(z * d3 + x, y), r);
+}
+*/
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADDBNRELU_OP
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include "framework/cl/cl_image.h"
+#include "framework/cl/cl_tool.h"
+namespace paddle_mobile {
+namespace operators {
+/**
+ * Prepares the fused conv + add + batch-norm + relu OpenCL kernel.
+ *
+ * Steps:
+ *  1. Enforce a square filter (dims()[2] == dims()[3]) and equal paddings.
+ *  2. Upload the add-bias image.
+ *  3. Fold the batch-norm statistics into a per-channel scale and bias:
+ *       inv_std   = 1 / sqrt(variance + epsilon)
+ *       new_scale = inv_std * scale
+ *       new_bias  = bias - mean * inv_std * scale
+ *     and store them on `param` as newly allocated CLImages.
+ *  4. Store offset = filter_size / 2 - padding on `param`.
+ *  5. Select and register the OpenCL kernel that matches the filter shape
+ *     (1x1, depthwise 3x3 or general 3x3).
+ *
+ * @param param fused-op parameters; receives NewScale, NewBias and Offset.
+ * @return true on success. Unsupported filter shapes raise via
+ *         PADDLE_MOBILE_THROW_EXCEPTION.
+ */
+template <>
+bool ConvAddBNReluKernel<GPU_CL, float>::Init(
+    FusionConvAddBNReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  param->Bias()->InitCLImage(cl_helper_.CLContext(),
+                             cl_helper_.CLCommandQueue());
+  const framework::CLImage *mean = param->InputMean();
+  const framework::CLImage *variance = param->InputVariance();
+  const framework::CLImage *scale = param->InputScale();
+  const framework::CLImage *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+  const int C = mean->numel();
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+  float *new_scale_ptr = new float[C];
+  float *new_bias_ptr = new float[C];
+  // The inverse standard deviation is computed per channel inside the loop,
+  // so no variable-length stack array (a non-standard C++ extension) is
+  // needed.
+  for (int i = 0; i < C; i++) {
+    const float inv_std =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+    new_scale_ptr[i] = inv_std * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std * scale_ptr[i];
+  }
+  // The folded parameters reuse the dims of the variance tensor.
+  framework::CLImage *new_scale = new framework::CLImage();
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         cl_helper_.CLCommandQueue());
+  framework::CLImage *new_bias = new framework::CLImage();
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        cl_helper_.CLCommandQueue());
+  // The raw pointers are handed to `param`; they are not freed here.
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  // The host-side staging buffers are released once both images are built.
+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+  // Kernel selection. Each branch uploads the filter with the layout used by
+  // that kernel and registers the kernel at index 0 of cl_helper_.
+  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+    DLOG << " conv add bn relu conv 1x1";
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    // Depthwise: one filter channel and matching input/output channel counts.
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+    DLOG << " conv add bn relu depth_conv_3x3";
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+    DLOG << " conv add bn relu conv_3x3";
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+  return true;
+}
+// Launches the fused conv + add + batch-norm + relu OpenCL kernel stored at
+// slot 0 of the helper.
+//
+// Binds 17 positional kernel arguments (the work-size triple, six CL images,
+// then eight scalar geometry values) and enqueues the kernel over the default
+// work size derived from the output image. No wait list or completion event
+// is passed to the enqueue call.
+template <>
+void ConvAddBNReluKernel<GPU_CL, float>::Compute(
+    const FusionConvAddBNReluParam<GPU_CL> &param) {
+  auto cl_kernel_handle = this->cl_helper_.KernelAt(0);
+  auto work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  // The three work-size components are also forwarded to the kernel itself.
+  int c_block = work_size[0];
+  int w = work_size[1];
+  int nh = work_size[2];
+  auto input_image = param.Input()->GetCLImage();
+  auto filter_image = param.Filter()->GetCLImage();
+  auto bias_image = param.Bias()->GetCLImage();
+  auto new_scale_image = param.NewScale()->GetCLImage();
+  auto new_bias_image = param.NewBias()->GetCLImage();
+  auto output_image = param.Output()->GetCLImage();
+  // Only the first stride / dilation entry is forwarded to the kernel.
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  // Positional argument table: the array index is the kernel argument index.
+  struct ArgSlot {
+    size_t bytes;
+    const void *value;
+  };
+  const ArgSlot arg_table[] = {
+      {sizeof(int), &c_block},             // 0
+      {sizeof(int), &w},                   // 1
+      {sizeof(int), &nh},                  // 2
+      {sizeof(cl_mem), &input_image},      // 3
+      {sizeof(cl_mem), &filter_image},     // 4
+      {sizeof(cl_mem), &bias_image},       // 5
+      {sizeof(cl_mem), &new_scale_image},  // 6
+      {sizeof(cl_mem), &new_bias_image},   // 7
+      {sizeof(cl_mem), &output_image},     // 8
+      {sizeof(int), &stride},              // 9
+      {sizeof(int), &offset},              // 10
+      {sizeof(int), &input_c},             // 11
+      {sizeof(int), &dilation},            // 12
+      {sizeof(int), &input_width},         // 13
+      {sizeof(int), &input_height},        // 14
+      {sizeof(int), &output_width},        // 15
+      {sizeof(int), &output_height},       // 16
+  };
+  const cl_uint arg_count = sizeof(arg_table) / sizeof(arg_table[0]);
+  cl_int status;
+  for (cl_uint slot = 0; slot < arg_count; ++slot) {
+    status = clSetKernelArg(cl_kernel_handle, slot, arg_table[slot].bytes,
+                            arg_table[slot].value);
+    CL_CHECK_ERRORS(status);
+  }
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(),
+                                  cl_kernel_handle, work_size.size(), NULL,
+                                  work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class ConvAddBNReluKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+#include "operators/kernel/conv_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the fused conv + add OpenCL kernel.
+//
+// Enforces a square filter (dims[2] == dims[3]) and equal paddings[0] /
+// paddings[1], initializes the bias CL image, stores the offset
+// (filter dims[2] / 2 - paddings[1]) on the param, then chooses the filter
+// image initializer and the kernel by filter shape:
+//   dims[2] == 1 && dims[3] == 1            -> InitNImage  + "conv_1x1"
+//   filter dims[1] == 1, input dims[1] ==
+//   output dims[1], filter dims[2] == 3     -> InitDWImage + "depth_conv_3x3"
+//   dims[2] == 3 && dims[3] == 3            -> InitCLImage + "conv_3x3"
+// Any other shape goes through PADDLE_MOBILE_THROW_EXCEPTION.
+template <>
+bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  auto context = cl_helper_.CLContext();
+  auto queue = this->cl_helper_.CLCommandQueue();
+  param->Bias()->InitCLImage(context, queue);
+  auto filter = param->Filter();
+  const auto filter_h = filter->dims()[2];
+  const auto filter_w = filter->dims()[3];
+  const int half_kernel = static_cast<int>(filter_h) / 2;
+  const int padding = static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(half_kernel - padding);
+  // All three kernel variants are built from the same program file.
+  const char *cl_file = "conv_add_kernel.cl";
+  if (filter_h == 1 && filter_w == 1) {
+    filter->InitNImage(context, queue);
+    this->cl_helper_.AddKernel("conv_1x1", cl_file);
+  } else if (filter->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             filter_h == 3) {
+    filter->InitDWImage(context, queue);
+    this->cl_helper_.AddKernel("depth_conv_3x3", cl_file);
+  } else if (filter_h == 3 && filter_w == 3) {
+    filter->InitCLImage(context, queue);
+    this->cl_helper_.AddKernel("conv_3x3", cl_file);
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+  return true;
+}
+// Launches the fused conv + add OpenCL kernel stored at slot 0 of the helper.
+//
+// Binds 15 positional kernel arguments (the work-size triple, four CL images,
+// then eight scalar geometry values) and enqueues the kernel over the default
+// work size derived from the output image. No wait list or completion event
+// is passed to the enqueue call.
+template <>
+void ConvAddKernel<GPU_CL, float>::Compute(
+    const FusionConvAddParam<GPU_CL> &param) {
+  auto cl_kernel_handle = this->cl_helper_.KernelAt(0);
+  auto work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  // The three work-size components are also forwarded to the kernel itself.
+  int c_block = work_size[0];
+  int w = work_size[1];
+  int nh = work_size[2];
+  auto input_image = param.Input()->GetCLImage();
+  auto filter_image = param.Filter()->GetCLImage();
+  // Debug trace of the filter and paddings, emitted on every run.
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
+  auto bias_image = param.Bias()->GetCLImage();
+  auto output_image = param.Output()->GetCLImage();
+  // Only the first stride / dilation entry is forwarded to the kernel.
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  // Positional argument table: the array index is the kernel argument index.
+  struct ArgSlot {
+    size_t bytes;
+    const void *value;
+  };
+  const ArgSlot arg_table[] = {
+      {sizeof(int), &c_block},          // 0
+      {sizeof(int), &w},                // 1
+      {sizeof(int), &nh},               // 2
+      {sizeof(cl_mem), &input_image},   // 3
+      {sizeof(cl_mem), &filter_image},  // 4
+      {sizeof(cl_mem), &bias_image},    // 5
+      {sizeof(cl_mem), &output_image},  // 6
+      {sizeof(int), &stride},           // 7
+      {sizeof(int), &offset},           // 8
+      {sizeof(int), &input_c},          // 9
+      {sizeof(int), &dilation},         // 10
+      {sizeof(int), &input_width},      // 11
+      {sizeof(int), &input_height},     // 12
+      {sizeof(int), &output_width},     // 13
+      {sizeof(int), &output_height},    // 14
+  };
+  const cl_uint arg_count = sizeof(arg_table) / sizeof(arg_table[0]);
+  cl_int status;
+  for (cl_uint slot = 0; slot < arg_count; ++slot) {
+    status = clSetKernelArg(cl_kernel_handle, slot, arg_table[slot].bytes,
+                            arg_table[slot].value);
+    CL_CHECK_ERRORS(status);
+  }
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(),
+                                  cl_kernel_handle, work_size.size(), NULL,
+                                  work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class ConvAddKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADDRELU_OP
+#include "operators/kernel/conv_add_relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the fused conv + add + relu OpenCL kernel.
+//
+// Enforces a square filter (dims[2] == dims[3]) and equal paddings[0] /
+// paddings[1], initializes the bias CL image, stores the offset
+// (filter dims[2] / 2 - paddings[1]) on the param, then chooses the filter
+// image initializer and the kernel by filter shape:
+//   dims[2] == 1 && dims[3] == 1            -> InitNImage  + "conv_1x1"
+//   filter dims[1] == 1, input dims[1] ==
+//   output dims[1], filter dims[2] == 3     -> InitDWImage + "depth_conv_3x3"
+//   dims[2] == 3 && dims[3] == 3            -> InitCLImage + "conv_3x3"
+// Any other shape goes through PADDLE_MOBILE_THROW_EXCEPTION.
+template <>
+bool ConvAddReluKernel<GPU_CL, float>::Init(
+    FusionConvAddReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  auto context = cl_helper_.CLContext();
+  auto queue = this->cl_helper_.CLCommandQueue();
+  param->Bias()->InitCLImage(context, queue);
+  auto filter = param->Filter();
+  const auto filter_h = filter->dims()[2];
+  const auto filter_w = filter->dims()[3];
+  const int half_kernel = static_cast<int>(filter_h) / 2;
+  const int padding = static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(half_kernel - padding);
+  // All three kernel variants are built from the same program file.
+  const char *cl_file = "conv_add_relu_kernel.cl";
+  if (filter_h == 1 && filter_w == 1) {
+    filter->InitNImage(context, queue);
+    this->cl_helper_.AddKernel("conv_1x1", cl_file);
+  } else if (filter->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             filter_h == 3) {
+    filter->InitDWImage(context, queue);
+    this->cl_helper_.AddKernel("depth_conv_3x3", cl_file);
+  } else if (filter_h == 3 && filter_w == 3) {
+    filter->InitCLImage(context, queue);
+    this->cl_helper_.AddKernel("conv_3x3", cl_file);
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+  return true;
+}
+// Launches the fused conv + add + relu OpenCL kernel stored at slot 0 of the
+// helper.
+//
+// Binds 15 positional kernel arguments (the work-size triple, four CL images,
+// then eight scalar geometry values) and enqueues the kernel over the default
+// work size derived from the output image. No wait list or completion event
+// is passed to the enqueue call.
+template <>
+void ConvAddReluKernel<GPU_CL, float>::Compute(
+    const FusionConvAddReluParam<GPU_CL> &param) {
+  auto cl_kernel_handle = this->cl_helper_.KernelAt(0);
+  auto work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  // The three work-size components are also forwarded to the kernel itself.
+  int c_block = work_size[0];
+  int w = work_size[1];
+  int nh = work_size[2];
+  auto input_image = param.Input()->GetCLImage();
+  auto filter_image = param.Filter()->GetCLImage();
+  // Debug trace of the filter and paddings, emitted on every run.
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
+  auto bias_image = param.Bias()->GetCLImage();
+  auto output_image = param.Output()->GetCLImage();
+  // Only the first stride / dilation entry is forwarded to the kernel.
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  // Positional argument table: the array index is the kernel argument index.
+  struct ArgSlot {
+    size_t bytes;
+    const void *value;
+  };
+  const ArgSlot arg_table[] = {
+      {sizeof(int), &c_block},          // 0
+      {sizeof(int), &w},                // 1
+      {sizeof(int), &nh},               // 2
+      {sizeof(cl_mem), &input_image},   // 3
+      {sizeof(cl_mem), &filter_image},  // 4
+      {sizeof(cl_mem), &bias_image},    // 5
+      {sizeof(cl_mem), &output_image},  // 6
+      {sizeof(int), &stride},           // 7
+      {sizeof(int), &offset},           // 8
+      {sizeof(int), &input_c},          // 9
+      {sizeof(int), &dilation},         // 10
+      {sizeof(int), &input_width},      // 11
+      {sizeof(int), &input_height},     // 12
+      {sizeof(int), &output_width},     // 13
+      {sizeof(int), &output_height},    // 14
+  };
+  const cl_uint arg_count = sizeof(arg_table) / sizeof(arg_table[0]);
+  cl_int status;
+  for (cl_uint slot = 0; slot < arg_count; ++slot) {
+    status = clSetKernelArg(cl_kernel_handle, slot, arg_table[slot].bytes,
+                            arg_table[slot].value);
+    CL_CHECK_ERRORS(status);
+  }
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(),
+                                  cl_kernel_handle, work_size.size(), NULL,
+                                  work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class ConvAddReluKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONV_OP
+#include "operators/kernel/conv_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the plain convolution OpenCL kernel.
+//
+// Enforces a square filter (dims[2] == dims[3]) and equal paddings[0] /
+// paddings[1]. When the filter's dims[1] is 1 the filter is resized so that
+// its first two dims are swapped before the CL image is initialized. The
+// offset (filter dims[2] / 2 - paddings[1]) is stored on the param, and the
+// kernel is chosen from the (possibly resized) filter dims:
+//   dims[2] == 1 && dims[3] == 1          -> "conv_1x1"       (conv_kernel.cl)
+//   dims[0] == 1, input dims[1] == output
+//   dims[1], dims[2] == 3                 -> "depth_conv_3x3"
+//                                            (depthwise_conv_kernel.cl)
+//   dims[2] == 3 && dims[3] == 3          -> "conv_3x3"       (conv_kernel.cl)
+// Any other shape goes through PADDLE_MOBILE_THROW_EXCEPTION.
+template <>
+bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  auto original_dims = param->Filter()->dims();
+  if (original_dims[1] == 1) {
+    // Same extents with dims[0] and dims[1] exchanged.
+    std::vector<int64_t> swapped_shape({original_dims[1], original_dims[0],
+                                        original_dims[2], original_dims[3]});
+    framework::DDim swapped_dims = framework::make_ddim(swapped_shape);
+    param->Filter()->Resize(swapped_dims);
+  }
+  auto filter = param->Filter();
+  filter->InitCLImage(cl_helper_.CLContext(),
+                      this->cl_helper_.CLCommandQueue());
+  // From here on the dims are read after any resize above.
+  const auto filter_h = filter->dims()[2];
+  const auto filter_w = filter->dims()[3];
+  const int half_kernel = static_cast<int>(filter_h) / 2;
+  const int padding = static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(half_kernel - padding);
+  DLOG << " init helper: " << &cl_helper_;
+  DLOG << " conv kernel add kernel ~ ";
+  DLOG << " width of one block: " << filter_w;
+  DLOG << " height of one block: " << filter_h;
+  DLOG << " filter dims: " << param->Filter()->dims();
+  if (filter_h == 1 && filter_w == 1) {
+    DLOG << " here1 ";
+    this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
+  } else if (filter->dims()[0] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             filter_h == 3) {
+    DLOG << " here2 ";
+    this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl");
+  } else if (filter_h == 3 && filter_w == 3) {
+    DLOG << " here3 ";
+    this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+  return true;
+}
+// Launches the plain convolution OpenCL kernel stored at slot 0 of the helper.
+//
+// Binds 14 positional kernel arguments (the work-size triple, three CL images,
+// then eight scalar geometry values) and enqueues the kernel over the default
+// work size derived from the output image. Every clSetKernelArg result is
+// passed to CL_CHECK_ERRORS, so a rejected argument is reported at the point
+// of failure instead of surfacing only as an enqueue error. No wait list or
+// completion event is passed to the enqueue call.
+template <>
+void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  // The three work-size components are also forwarded to the kernel itself.
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  // Only the first stride / dilation entry is forwarded to the kernel.
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  DLOG << " begin set kernel arg ";
+  DLOG << " c block " << c_block;
+  DLOG << " w " << w;
+  DLOG << " nh " << nh;
+  DLOG << " stride " << stride;
+  DLOG << " offset " << offset;
+  DLOG << " input_c " << input_c;
+  DLOG << " dilation " << dilation;
+  DLOG << " input width " << input_width;
+  DLOG << " input height " << input_height;
+  DLOG << " output width " << output_width;
+  DLOG << " output height " << output_height;
+  // Positional argument table: the array index is the kernel argument index.
+  struct ArgSlot {
+    size_t bytes;
+    const void *value;
+  };
+  const ArgSlot arg_table[] = {
+      {sizeof(int), &c_block},        // 0
+      {sizeof(int), &w},              // 1
+      {sizeof(int), &nh},             // 2
+      {sizeof(cl_mem), &input},       // 3
+      {sizeof(cl_mem), &filter},      // 4
+      {sizeof(cl_mem), &output},      // 5
+      {sizeof(int), &stride},         // 6
+      {sizeof(int), &offset},         // 7
+      {sizeof(int), &input_c},        // 8
+      {sizeof(int), &dilation},       // 9
+      {sizeof(int), &input_width},    // 10
+      {sizeof(int), &input_height},   // 11
+      {sizeof(int), &output_width},   // 12
+      {sizeof(int), &output_height},  // 13
+  };
+  const cl_uint arg_count = sizeof(arg_table) / sizeof(arg_table[0]);
+  cl_int status;
+  for (cl_uint slot = 0; slot < arg_count; ++slot) {
+    status = clSetKernelArg(kernel, slot, arg_table[slot].bytes,
+                            arg_table[slot].value);
+    // Previously these results were overwritten without being inspected.
+    CL_CHECK_ERRORS(status);
+  }
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class ConvKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DEPTHWISECONV_OP
+#include "operators/kernel/depthwise_conv_kernel.h"
+#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the depthwise convolution OpenCL kernel.
+//
+// Enforces a square filter (dims[2] == dims[3]) and equal paddings[0] /
+// paddings[1], initializes the filter CL image, stores the offset
+// (filter dims[2] / 2 - paddings[1]) on the param and registers the
+// "depth_conv_3x3" kernel. Always returns true.
+template <>
+bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
+  DLOG << " depthwise conv kernel init begin ";
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  auto filter = param->Filter();
+  filter->InitCLImage(cl_helper_.CLContext(),
+                      this->cl_helper_.CLCommandQueue());
+  const int half_kernel = static_cast<int>(filter->dims()[2]) / 2;
+  const int padding = static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(half_kernel - padding);
+  // NOTE(review): Compute binds 14 arguments, the same layout ConvKernel uses
+  // with "depthwise_conv_kernel.cl". Confirm that "depth_conv_3x3" as built
+  // from "conv_add_bn_relu_kernel.cl" really has a matching signature.
+  // NOTE(review): the filter size is not checked to be 3x3 here - confirm
+  // callers guarantee it.
+  this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  DLOG << " depthwise conv kernel init end ";
+  return true;
+}
+// Launches the depthwise convolution OpenCL kernel stored at slot 0 of the
+// helper.
+//
+// Binds 14 positional kernel arguments (the work-size triple, three CL images,
+// then eight scalar geometry values) and enqueues the kernel over the default
+// work size derived from the output image. Every clSetKernelArg result is
+// passed to CL_CHECK_ERRORS; previously only the status of the last argument
+// was inspected, so earlier failures were silently overwritten. No wait list
+// or completion event is passed to the enqueue call.
+template <>
+void DepthwiseConvKernel<GPU_CL, float>::Compute(
+    const ConvParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  // The three work-size components are also forwarded to the kernel itself.
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  // Only the first stride / dilation entry is forwarded to the kernel.
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  // Positional argument table: the array index is the kernel argument index.
+  struct ArgSlot {
+    size_t bytes;
+    const void *value;
+  };
+  const ArgSlot arg_table[] = {
+      {sizeof(int), &c_block},        // 0
+      {sizeof(int), &w},              // 1
+      {sizeof(int), &nh},             // 2
+      {sizeof(cl_mem), &input},       // 3
+      {sizeof(cl_mem), &filter},      // 4
+      {sizeof(cl_mem), &output},      // 5
+      {sizeof(int), &stride},         // 6
+      {sizeof(int), &offset},         // 7
+      {sizeof(int), &input_c},        // 8
+      {sizeof(int), &dilation},       // 9
+      {sizeof(int), &input_width},    // 10
+      {sizeof(int), &input_height},   // 11
+      {sizeof(int), &output_width},   // 12
+      {sizeof(int), &output_height},  // 13
+  };
+  const cl_uint arg_count = sizeof(arg_table) / sizeof(arg_table[0]);
+  cl_int status;
+  for (cl_uint slot = 0; slot < arg_count; ++slot) {
+    status = clSetKernelArg(kernel, slot, arg_table[slot].bytes,
+                            arg_table[slot].value);
+    CL_CHECK_ERRORS(status);
+  }
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class DepthwiseConvKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#include "operators/kernel/elementwise_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Prepares the elementwise-add OpenCL kernel.
+//
+// Initializes the CL image of InputY (the bias) and registers a kernel
+// according to the number of bias dims:
+//   4 dims -> "elementwise_add" (elementwise_add_kernel.cl)
+//   1 dim  -> "channel_add"     (channel_add_kernel.cl)
+// Returns true when a kernel was registered. For any other dim count no
+// kernel is registered, the error is logged and false is returned, so the
+// failure is visible to the caller rather than surfacing later when Compute
+// looks up kernel slot 0.
+template <>
+bool ElementwiseAddKernel<GPU_CL, float>::Init(
+    ElementwiseAddParam<GPU_CL> *param) {
+  DLOG << "-----init add-----";
+  // C-style cast kept as in the original: the declared return type of
+  // InputY() is not visible from this file.
+  CLImage *bias = (CLImage *)(param->InputY());
+  bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue());
+  DLOG << " bias: " << *bias;
+  const auto bias_rank = bias->dims().size();
+  if (bias_rank == 4) {
+    this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  } else if (bias_rank == 1) {
+    this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
+  } else {
+    DLOG << "error:bias dims is error";
+    return false;
+  }
+  return true;
+}
+// Runs the elementwise-add OpenCL kernel registered by Init.
+//
+// Supports a bias (InputY) with 4 dims or with 1 dim. In both cases the
+// input, bias and output images are bound as arguments 0-2; for a 1-dim bias
+// the input's dims[3] is additionally bound as argument 3. The kernel is
+// enqueued over a 2-D range of (input image width, input image height).
+// For any other bias dim count the error is logged and nothing is enqueued;
+// the kernel slot is not touched in that case because Init registers no
+// kernel for such a bias.
+template <>
+void ElementwiseAddKernel<GPU_CL, float>::Compute(
+    const ElementwiseAddParam<GPU_CL> &param) {
+  auto input = param.InputX();
+  auto bias = param.InputY();
+  auto output = param.Out();
+  const auto bias_rank = bias->dims().size();
+  const bool is_channel_add = (bias_rank == 1);
+  if (bias_rank != 4 && !is_channel_add) {
+    DLOG << "error:bias dims is error";
+    return;
+  }
+  auto kernel = this->cl_helper_.KernelAt(0);
+  cl_mem input_image = input->GetCLImage();
+  cl_mem bias_image = bias->GetCLImage();
+  cl_mem output_image = output->GetCLImage();
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bias_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+  if (is_channel_add) {
+    // The channel_add kernel takes one extra scalar: the input's dims[3].
+    int tensor_w = input->dims()[3];
+    status = clSetKernelArg(kernel, 3, sizeof(cl_int), &tensor_w);
+    CL_CHECK_ERRORS(status);
+  }
+  // Explicit casts: brace-initializing size_t from a non-constant int is a
+  // narrowing conversion.
+  const size_t global_work_size[2] = {
+      static_cast<size_t>(input->ImageWidth()),
+      static_cast<size_t>(input->ImageHeight())};
+  status =
+      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                             NULL, global_work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class ElementwiseAddKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/feed_kernel.h"
+#include "framework/cl/cl_tensor.h"
+namespace paddle_mobile {
+namespace operators {
+// Registers the "feed" kernel from feed_kernel.cl. The param is not inspected.
+// Always returns true.
+template <>
+bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
+  const char *kernel_name = "feed";
+  const char *cl_file = "feed_kernel.cl";
+  DLOG << "Init feed";
+  this->cl_helper_.AddKernel(kernel_name, cl_file);
+  return true;
+}
+// Runs the "feed" kernel that moves the host input tensor into the output
+// CL image.
+//
+// The host float data of InputX is handed to a temporary CLTensor via
+// mutable_with_data (presumably this creates a device buffer holding that
+// data - confirm against CLTensor). The resulting buffer, the output image
+// and the output's dims[3] / dims[2] are bound as arguments 0-3, and the
+// kernel is enqueued over a 2-D range of (dims[3], dims[2]).
+template <>
+void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  cl_int status;
+  auto output = param.Out();
+  const Tensor *input = param.InputX();
+  const float *input_data = input->data<float>();
+  cl_mem cl_image = output->GetCLImage();
+  int height = output->dims()[2];
+  int width = output->dims()[3];
+  CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
+                           this->cl_helper_.CLCommandQueue());
+  input_cl_tensor.Resize(input->dims());
+  cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data);
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height);
+  CL_CHECK_ERRORS(status);
+  // Explicit casts: brace-initializing size_t from a non-constant int is a
+  // narrowing conversion.
+  const size_t global_work_size[2] = {static_cast<size_t>(width),
+                                      static_cast<size_t>(height)};
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                  NULL, global_work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class FeedKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/cl/fetch_kernel.cpp
+++ b/src/operators/kernel/cl/fetch_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/fetch_kernel.h"
+#include "framework/cl/cl_tensor.h"
+// #include "common/common.h"
+// #include <iostream>
+namespace paddle_mobile {
+namespace operators {
+// Registers the fetch kernel and prepares the output tensor.
+//
+// Inputs with at most 2 dims use "fetch_2d"; all others use "fetch". Both come
+// from fetch_kernel.cl. mutable_data<float>() is then called on the output.
+// Always returns true.
+template <>
+bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
+  const bool at_most_2d = param->InputX()->dims().size() <= 2;
+  const char *kernel_name = at_most_2d ? "fetch_2d" : "fetch";
+  this->cl_helper_.AddKernel(kernel_name, "fetch_kernel.cl");
+  param->Out()->mutable_data<float>();
+  return true;
+}
+// Runs the fetch kernel and copies its result into the host output tensor.
+//
+// The input dims are right-aligned into a 4-element shape padded with 1s.
+// Height, width, the input image and a temporary CLTensor buffer are bound as
+// arguments 0-3; for inputs with more than 2 dims three extra stride-like
+// scalars (height*width, height*width*4, height*width*dims[1]) are bound as
+// arguments 4-6. After enqueueing, clFinish blocks until the queue has
+// drained, and the buffer contents are then memcpy'd into the output tensor.
+//
+// The scalar arguments are held in int variables so that the sizeof(int)
+// passed to clSetKernelArg matches the object being pointed at, and every
+// OpenCL return code is passed to CL_CHECK_ERRORS.
+template <>
+void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
+  auto input = param.InputX()->GetCLImage();
+  auto *out = param.Out();
+  const auto &dim = param.InputX()->dims();
+  // NOTE(review): more than 4 dims would index new_dims out of range; this is
+  // not guarded here - confirm callers never pass such a tensor.
+  size_t new_dims[] = {1, 1, 1, 1};
+  for (int j = 0; j < dim.size(); ++j) {
+    new_dims[4 - dim.size() + j] = dim[j];
+  }
+  const int channels = static_cast<int>(new_dims[1]);
+  const int in_height = static_cast<int>(new_dims[2]);
+  // Inputs with at most 2 dims take their width from the CL image instead of
+  // from the tensor dims.
+  const int in_width = dim.size() <= 2
+                           ? static_cast<int>(param.InputX()->ImageWidth())
+                           : static_cast<int>(new_dims[3]);
+  CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
+                         this->cl_helper_.CLCommandQueue());
+  out_cl_tensor.Resize(out->dims());
+  cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(int), &in_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &in_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
+  CL_CHECK_ERRORS(status);
+  if (dim.size() > 2) {
+    int size_ch = in_height * in_width;
+    int size_block = size_ch * 4;
+    int size_batch = size_ch * channels;
+    status = clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 5, sizeof(int), &size_block);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
+    CL_CHECK_ERRORS(status);
+  }
+  // Use the vector's own length as work_dim, as the other kernels in this
+  // directory do, rather than assuming it always has three entries.
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+  // Wait for the queue to drain before reading the result on the host.
+  status = clFinish(this->cl_helper_.CLCommandQueue());
+  CL_CHECK_ERRORS(status);
+  memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
+}
+template class FetchKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/cl/pool_kernel.cpp
+++ b/src/operators/kernel/cl/pool_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#include "operators/kernel/pool_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Registers the OpenCL kernel named "pool_" + <pooling type> from
+// pool_kernel.cl. Always returns true.
+template <>
+bool PoolKernel<GPU_CL, float>::Init(PoolParam<GPU_CL> *param) {
+  const std::string kernel_name = "pool_" + param->PoolingType();
+  this->cl_helper_.AddKernel(kernel_name, "pool_kernel.cl");
+  return true;
+}
+// Binds the pooling geometry and the input/output CL images to the kernel
+// registered in Init and enqueues it over the output's default work size.
+// The enqueue is not followed by a wait in this function.
+template <>
+void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  auto input = param.Input()->GetCLImage();
+  auto out = param.Output()->GetCLImage();
+
+  // NOTE(review): both converters are assumed to be CLImageConverterFolder;
+  // the reinterpret_cast performs no check -- confirm at the call sites.
+  framework::CLImageConverterFolder *input_folder_converter =
+      reinterpret_cast<framework::CLImageConverterFolder *>(
+          param.Input()->Converter());
+  framework::CLImageConverterFolder *output_folder_converter =
+      reinterpret_cast<framework::CLImageConverterFolder *>(
+          param.Output()->Converter());
+
+  // Bind by const reference: the attribute vectors are only read here.
+  const auto &ksize = param.Ksize();
+  const auto &strides = param.Strides();
+  const auto &paddings = param.Paddings();
+
+  // Scalar arguments in kernel-argument order (indices 0..9).
+  const cl_int scalar_args[] = {
+      input_folder_converter->HeightOfOneBlock(),   // 0: in_height
+      input_folder_converter->WidthOfOneBlock(),    // 1: in_width
+      output_folder_converter->HeightOfOneBlock(),  // 2: out_height
+      output_folder_converter->WidthOfOneBlock(),   // 3: out_width
+      paddings[0],                                  // 4: pad_top
+      paddings[1],                                  // 5: pad_left
+      strides[0],                                   // 6: stride_h
+      strides[1],                                   // 7: stride_w
+      ksize[0],                                     // 8: ksize_h
+      ksize[1],                                     // 9: ksize_w
+  };
+  const cl_uint scalar_count = sizeof(scalar_args) / sizeof(scalar_args[0]);
+  for (cl_uint i = 0; i < scalar_count; ++i) {
+    clSetKernelArg(kernel, i, sizeof(cl_int), &scalar_args[i]);
+  }
+  clSetKernelArg(kernel, scalar_count, sizeof(cl_mem), &input);
+  clSetKernelArg(kernel, scalar_count + 1, sizeof(cl_mem), &out);
+
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                         default_work_size.data(), NULL, 0, NULL, NULL);
+}
+template class PoolKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/relu_kernel.cpp
+++ b/src/operators/kernel/cl/relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RELU_OP
+#include "operators/kernel/relu_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Registers the "relu" OpenCL kernel from relu.cl with the kernel helper.
+// param is not used here. Always returns true.
+template <>
+bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
+  this->cl_helper_.AddKernel("relu", "relu.cl");
+  // A two-pass variant (relu_p0 / relu_p1 through an intermediate image) was
+  // previously sketched here in commented-out form; only the single-pass
+  // kernel is registered.
+  return true;
+}
+// Enqueues the "relu" kernel registered in Init: argument 0 is the input CL
+// image, argument 1 the output CL image, and the 2-D global work size is the
+// input image's width x height. No wait is issued after the enqueue.
+template <>
+void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
+  auto relu_kernel = this->cl_helper_.KernelAt(0);
+  const auto* src = param.InputX();
+  auto* dst = param.Out();
+  // NOTE(review): the result is never read; the call is kept in case
+  // DefaultWorkSize validates the tensor -- confirm and remove if it is pure.
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*dst);
+
+  auto src_image = src->GetCLImage();
+  auto dst_image = dst->GetCLImage();
+  clSetKernelArg(relu_kernel, 0, sizeof(cl_mem), &src_image);
+  clSetKernelArg(relu_kernel, 1, sizeof(cl_mem), &dst_image);
+
+  const size_t global_size[2] = {src->ImageWidth(), src->ImageHeight()};
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), relu_kernel, 2,
+                         NULL, global_size, NULL, 0, NULL, NULL);
+}
+template class ReluKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/cl/reshape_kernel.cpp
+++ b/src/operators/kernel/cl/reshape_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Registers the "reshape" OpenCL kernel from reshape.cl with the kernel
+// helper. param is not used here. Always returns true.
+template <>
+bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
+  this->cl_helper_.AddKernel("reshape", "reshape.cl");
+  return true;
+}
+// Enqueues the "reshape" kernel registered in Init.
+// Kernel arguments: 0 = input image, 1 = output image, 2..5 = input dims,
+// 6..9 = output dims, each dim list right-aligned in 4 slots and padded with
+// leading 1s. The 2-D global work size is the output image's width x height.
+template <>
+void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  const auto *input = param.InputX();
+  auto *output = param.Out();
+  auto inputImage = input->GetCLImage();
+  auto outputImage = output->GetCLImage();
+  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
+  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+
+  const auto &inputDim = input->dims();
+  const auto &outputDim = output->dims();
+  // NOTE(review): both ranks are assumed to be <= 4; a larger rank would
+  // index outside these arrays -- confirm callers guarantee this.
+  cl_int dims[4] = {1, 1, 1, 1};
+  cl_int odims[4] = {1, 1, 1, 1};
+  // e.g. an input of shape {1, 1000, 1, 1} ...
+  for (int i = 0; i < inputDim.size(); i++) {
+    dims[4 - inputDim.size() + i] = inputDim[i];
+  }
+  // ... reshaped to an output of shape {1, 1, 1, 1000}.
+  for (int i = 0; i < outputDim.size(); i++) {
+    odims[4 - outputDim.size() + i] = outputDim[i];
+  }
+
+  // Pass every dimension once. Previously arguments 7, 8 and 9 all received
+  // odims[1], so odims[2] and odims[3] never reached the kernel.
+  for (int i = 0; i < 4; ++i) {
+    clSetKernelArg(kernel, 2 + i, sizeof(cl_int), &dims[i]);
+    clSetKernelArg(kernel, 6 + i, sizeof(cl_int), &odims[i]);
+  }
+
+  const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
+                         work_size, NULL, 0, NULL, NULL);
+}
+template class ReshapeKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/cl/softmax_kernel.cpp
+++ b/src/operators/kernel/cl/softmax_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#include "operators/kernel/softmax_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Registers the "softmax" OpenCL kernel from softmax.cl with the kernel
+// helper. param is not used here. Always returns true.
+template <>
+bool SoftmaxKernel<GPU_CL, float>::Init(SoftmaxParam<GPU_CL> *param) {
+  this->cl_helper_.AddKernel("softmax", "softmax.cl");
+  return true;
+}
+// Enqueues the "softmax" kernel registered in Init.
+// Kernel arguments: 0 = input image, 1 = output image, 2 = the output
+// image's width (named "group" here). The global work size is the output's
+// default work size. Every OpenCL status code is passed to CL_CHECK_ERRORS.
+template <>
+void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
+  const auto *input = param.InputX();
+  auto *output = param.Out();
+  auto inputImage = input->GetCLImage();
+  auto outputImage = output->GetCLImage();
+  int group = output->ImageWidth();
+
+  // Check each call on its own: reusing one status variable without
+  // checking in between would hide every failure but the last.
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &group);
+  CL_CHECK_ERRORS(status);
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class SoftmaxKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/concat_kernel.h
+++ b/src/operators/kernel/concat_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class ConcatKernel
    : public framework::OpKernelBase<DeviceType, ConcatParam<DeviceType>> {
 public:
-  void Compute(const ConcatParam<DeviceType> &param) const;
+  void Compute(const ConcatParam<DeviceType> &param);
  bool Init(ConcatParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_add_add_prelu_kernel.h
+++ b/src/operators/kernel/conv_add_add_prelu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvAddAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
 public:
-  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param) const;
+  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param);
  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_add_bn_kernel.h
+++ b/src/operators/kernel/conv_add_bn_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvAddBNKernel
    : public OpKernelBase<DeviceType, FusionConvAddBNParam<DeviceType>> {
 public:
-  void Compute(const FusionConvAddBNParam<DeviceType> &param) const;
+  void Compute(const FusionConvAddBNParam<DeviceType> &param);
  bool Init(FusionConvAddBNParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvAddBNReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddBNReluParam<DeviceType>> {
 public:
-  void Compute(const FusionConvAddBNReluParam<DeviceType> &param) const;
+  void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
  bool Init(FusionConvAddBNReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -40,7 +40,7 @@ template <typename DeviceType, typename T>
 class ConvAddKernel
    : public OpKernelBase<DeviceType, FusionConvAddParam<DeviceType>> {
 public:
-  void Compute(const FusionConvAddParam<DeviceType> &param) const;
+  void Compute(const FusionConvAddParam<DeviceType> &param);
  bool Init(FusionConvAddParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_add_prelu_kernel.h
+++ b/src/operators/kernel/conv_add_prelu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
 public:
-  void Compute(const FusionConvAddPReluParam<DeviceType> &param) const;
+  void Compute(const FusionConvAddPReluParam<DeviceType> &param);
  bool Init(FusionConvAddPReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvAddReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddReluParam<DeviceType>> {
 public:
-  void Compute(const FusionConvAddReluParam<DeviceType> &param) const;
+  void Compute(const FusionConvAddReluParam<DeviceType> &param);
  bool Init(FusionConvAddReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_bn_add_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_add_relu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvBNAddReluKernel
    : public OpKernelBase<DeviceType, FusionConvBNAddReluParam<DeviceType>> {
 public:
-  void Compute(const FusionConvBNAddReluParam<DeviceType> &param) const;
+  void Compute(const FusionConvBNAddReluParam<DeviceType> &param);
  bool Init(FusionConvBNAddReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_bn_kernel.h
+++ b/src/operators/kernel/conv_bn_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvBNKernel
    : public OpKernelBase<DeviceType, FusionConvBNParam<DeviceType>> {
 public:
-  void Compute(const FusionConvBNParam<DeviceType> &param) const;
+  void Compute(const FusionConvBNParam<DeviceType> &param);
  bool Init(FusionConvBNParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_relu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class ConvBNReluKernel
    : public OpKernelBase<DeviceType, FusionConvBNReluParam<DeviceType>> {
 public:
-  void Compute(const FusionConvBNReluParam<DeviceType> &param) const;
+  void Compute(const FusionConvBNReluParam<DeviceType> &param);
  bool Init(FusionConvBNReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
@@ -31,7 +31,7 @@ using framework::OpKernelBase;
 template <typename DeviceType, typename T>
 class ConvKernel : public OpKernelBase<DeviceType, ConvParam<DeviceType>> {
 public:
-  void Compute(const ConvParam<DeviceType> &param) const;
+  void Compute(const ConvParam<DeviceType> &param);
  bool Init(ConvParam<DeviceType> *param);
 };

--- a/src/operators/kernel/conv_transpose_kernel.h
+++ b/src/operators/kernel/conv_transpose_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class ConvTransposeKernel
    : public OpKernelBase<DeviceType, ConvTransposeParam<DeviceType>> {
 public:
-  void Compute(const ConvTransposeParam<DeviceType> &param) const;
+  void Compute(const ConvTransposeParam<DeviceType> &param);
  bool Init(ConvTransposeParam<DeviceType> *param);
 };

--- a/src/operators/kernel/crf_kernel.h
+++ b/src/operators/kernel/crf_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class CrfKernel
    : public framework::OpKernelBase<DeviceType, CrfParam<DeviceType>> {
 public:
-  void Compute(const CrfParam<DeviceType>& param) const;
+  void Compute(const CrfParam<DeviceType>& param);
  bool Init(CrfParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/depthwise_conv_kernel.h
+++ b/src/operators/kernel/depthwise_conv_kernel.h
@@ -31,7 +31,7 @@ template <typename DeviceType, typename T>
 class DepthwiseConvKernel
    : public OpKernelBase<DeviceType, ConvParam<DeviceType>> {
 public:
-  void Compute(const ConvParam<DeviceType> &param) const;
+  void Compute(const ConvParam<DeviceType> &param);
  bool Init(ConvParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/dequantize_kernel.h
+++ b/src/operators/kernel/dequantize_kernel.h
@@ -26,7 +26,7 @@ template <typename DeviceType, typename T>
 class DequantizeKernel
    : public framework::OpKernelBase<DeviceType, DequantizeParam<DeviceType>> {
 public:
-  void Compute(const DequantizeParam<DeviceType> &param) const;
+  void Compute(const DequantizeParam<DeviceType> &param);
  bool Init(DequantizeParam<DeviceType> *param);
 };

--- a/src/operators/kernel/dropout_kernel.h
+++ b/src/operators/kernel/dropout_kernel.h
@@ -26,7 +26,7 @@ template <typename DeviceType, typename T>
 class DropoutKernel
    : public framework::OpKernelBase<DeviceType, DropoutParam<DeviceType>> {
 public:
-  void Compute(const DropoutParam<DeviceType>& param) const;
+  void Compute(const DropoutParam<DeviceType>& param);
  bool Init(DropoutParam<DeviceType>* para);
 };
 }  // namespace operators

--- a/src/operators/kernel/dwconv_bn_relu_kernel.h
+++ b/src/operators/kernel/dwconv_bn_relu_kernel.h
@@ -35,7 +35,7 @@ template <typename DeviceType, typename T>
 class DWConvBNReluKernel
    : public OpKernelBase<DeviceType, FusionDWConvBNReluParam<DeviceType>> {
 public:
-  void Compute(const FusionDWConvBNReluParam<DeviceType> &param) const;
+  void Compute(const FusionDWConvBNReluParam<DeviceType> &param);
  bool Init(FusionDWConvBNReluParam<DeviceType> *param);
 };

--- a/src/operators/kernel/elementwise_add_kernel.h
+++ b/src/operators/kernel/elementwise_add_kernel.h
@@ -30,7 +30,7 @@ class ElementwiseAddKernel
    : public framework::OpKernelBase<DeviceType,
                                     ElementwiseAddParam<DeviceType>> {
 public:
-  void Compute(const ElementwiseAddParam<DeviceType> &param) const;
+  void Compute(const ElementwiseAddParam<DeviceType> &param);
  bool Init(ElementwiseAddParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/elementwise_add_relu_kernel.h
+++ b/src/operators/kernel/elementwise_add_relu_kernel.h
@@ -29,7 +29,7 @@ class ElementwiseAddReluKernel
    : public framework::OpKernelBase<DeviceType,
                                     ElementwiseAddReluParam<DeviceType>> {
 public:
-  void Compute(const ElementwiseAddReluParam<DeviceType> &param) const;
+  void Compute(const ElementwiseAddReluParam<DeviceType> &param);
  bool Init(ElementwiseAddReluParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/elementwise_mul_kernel.h
+++ b/src/operators/kernel/elementwise_mul_kernel.h
@@ -28,7 +28,7 @@ class ElementwiseMulKernel
    : public framework::OpKernelBase<DeviceType,
                                     ElementwiseMulParam<DeviceType>> {
 public:
-  void Compute(const ElementwiseMulParam<DeviceType> &param) const;
+  void Compute(const ElementwiseMulParam<DeviceType> &param);
  bool Init(ElementwiseMulParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/elementwise_sub_kernel.h
+++ b/src/operators/kernel/elementwise_sub_kernel.h
@@ -28,7 +28,7 @@ class ElementwiseSubKernel
    : public framework::OpKernelBase<DeviceType,
                                     ElementwiseSubParam<DeviceType>> {
 public:
-  void Compute(const ElementwiseSubParam<DeviceType> &param) const;
+  void Compute(const ElementwiseSubParam<DeviceType> &param);
  bool Init(ElementwiseSubParam<DeviceType> *param);
 };

--- a/src/operators/kernel/fc_relu_kernel.h
+++ b/src/operators/kernel/fc_relu_kernel.h
@@ -28,7 +28,7 @@ class FusionFcReluKernel
    : public framework::OpKernelBase<DeviceType,
                                     FusionFcReluParam<DeviceType>> {
 public:
-  void Compute(const FusionFcReluParam<DeviceType>& param) const;
+  void Compute(const FusionFcReluParam<DeviceType>& param);
  bool Init(FusionFcReluParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/feed_kernel.h
+++ b/src/operators/kernel/feed_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using namespace framework;
+// Kernel interface operating on FeedParam<DeviceType>.
+// Only declarations live here; the member functions are defined as
+// per-device explicit specializations in the device-specific source files.
+template <typename DeviceType, typename T>
+class FeedKernel
+    : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
+ public:
+  // Device-specific setup; reports success through the bool result.
+  bool Init(FeedParam<DeviceType> *param);
+  // Executes the kernel for the given parameters.
+  void Compute(const FeedParam<DeviceType> &param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fetch_kernel.h
+++ b/src/operators/kernel/fetch_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using namespace framework;
+// Kernel interface operating on FetchParam<DeviceType>.
+// Only declarations live here; the member functions are defined as
+// per-device explicit specializations in the device-specific source files.
+template <typename DeviceType, typename T>
+class FetchKernel
+    : public framework::OpKernelBase<DeviceType, FetchParam<DeviceType>> {
+ public:
+  // Device-specific setup; reports success through the bool result.
+  bool Init(FetchParam<DeviceType> *param);
+  // Executes the kernel for the given parameters.
+  void Compute(const FetchParam<DeviceType> &param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/flatten_kernel.h
+++ b/src/operators/kernel/flatten_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class FlattenKernel
    : public framework::OpKernelBase<DeviceType, FlattenParam<DeviceType>> {
 public:
-  void Compute(const FlattenParam<DeviceType>& param) const;
+  void Compute(const FlattenParam<DeviceType>& param);
  bool Init(FlattenParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -58,7 +58,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
 }
 template <>
-void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) const {
+void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
  ComputeFPGAConcat(param.FpgaArgs());
 }
 template class ConcatKernel<FPGA, float>;

--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -78,7 +78,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
 template <>
 void ConvAddBNKernel<FPGA, float>::Compute(
-    const FusionConvAddBNParam<FPGA> &param) const {
+    const FusionConvAddBNParam<FPGA> &param) {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }

--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -76,7 +76,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
 template <>
 void ConvAddBNReluKernel<FPGA, float>::Compute(
-    const FusionConvAddBNReluParam<FPGA> &param) const {
+    const FusionConvAddBNReluParam<FPGA> &param) {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }

--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -58,7 +58,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
 template <>
 void ConvAddReluKernel<FPGA, float>::Compute(
-    const FusionConvAddReluParam<FPGA> &param) const {
+    const FusionConvAddReluParam<FPGA> &param) {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }

--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -69,8 +69,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
 }
 template <>
-void ConvBNKernel<FPGA, float>::Compute(
+void ConvBNKernel<FPGA, float>::Compute(const FusionConvBNParam<FPGA> &param) {
-    const FusionConvBNParam<FPGA> &param) const {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }

--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -70,7 +70,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
 template <>
 void ConvBNReluKernel<FPGA, float>::Compute(
-    const FusionConvBNReluParam<FPGA> &param) const {
+    const FusionConvBNReluParam<FPGA> &param) {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }

--- a/src/operators/kernel/fpga/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/dropout_kernel.cpp
@@ -26,8 +26,7 @@ bool DropoutKernel<FPGA, float>::Init(DropoutParam<FPGA> *param) {
 }
 template <>
-void DropoutKernel<FPGA, float>::Compute(
+void DropoutKernel<FPGA, float>::Compute(const DropoutParam<FPGA> &param) {}
-    const DropoutParam<FPGA> &param) const {}
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
@@ -56,7 +56,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
 template <>
 void ElementwiseAddReluKernel<FPGA, float>::Compute(
-    const ElementwiseAddReluParam<FPGA> &param) const {
+    const ElementwiseAddReluParam<FPGA> &param) {
  fpga::ComputeFpgaEWAdd(param.FpgaArgs());
 }
 }  // namespace operators

--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -61,7 +61,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
 }
 template <>
 void FusionFcReluKernel<FPGA, float>::Compute(
-    const FusionFcReluParam<FPGA> &param) const {
+    const FusionFcReluParam<FPGA> &param) {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }

--- a/src/operators/kernel/fpga/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/feed_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/feed_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Passes the output tensor to fpga::format_fp16_ofm (presumably preparing
+// it as an fp16 output feature map -- confirm against the fpga API).
+// Always returns true.
+template <>
+bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
+  Tensor *feed_out = param->Out();
+  fpga::format_fp16_ofm(feed_out);
+  return true;
+}
+// Formats the input tensor with fpga::format_image and then issues an FPGA
+// bypass that reads it as FP32 / CHW and writes the output tensor as
+// FP16 / HWC. Image channels, height and width come from input dims 1, 2
+// and 3 (i.e. a 4-D input is expected).
+template <>
+void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
+  auto input =
+      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  fpga::format_image(input);
+  // Read the data pointer only after formatting, so the bypass never uses an
+  // address captured before format_image ran.
+  // NOTE(review): this matters if format_image replaces the tensor's buffer;
+  // confirm against the fpga API.
+  auto input_ptr = input->data<float>();
+
+  Tensor *output = param.Out();
+  auto output_ptr = output->data<float>();
+
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+  args.input_data_type = fpga::DATA_TYPE_FP32;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.image.channels = static_cast<uint32_t>(input->dims()[1]);
+  args.image.height = static_cast<uint32_t>(input->dims()[2]);
+  args.image.width = static_cast<uint32_t>(input->dims()[3]);
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output_ptr;
+  args.output.scale_address = output->scale;
+  fpga::PerformBypass(args);
+}
+template class FeedKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/fetch_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+// NOTE(review): this file used to be wrapped in `#ifdef FUSION_CONVADD_OP`,
+// a macro unrelated to the fetch kernel. The guard is removed so that the
+// FPGA fetch kernel is always compiled, like its header.
+#include "operators/kernel/fetch_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Nothing to prepare for the FPGA fetch kernel; always returns true.
+template <>
+bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
+  return true;
+}
+// Makes the output tensor share the input tensor's data via ShareDataWith.
+template <>
+void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+}
+template class FetchKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -62,8 +62,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
 }
 template <>
-void FusionFcKernel<FPGA, float>::Compute(
+void FusionFcKernel<FPGA, float>::Compute(const FusionFcParam<FPGA> &param) {
-    const FusionFcParam<FPGA> &param) const {
  fpga::ComputeFpgaConv(param.FpgaArgs());
 }
 }  // namespace operators

--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -53,7 +53,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
 }
 template <>
-void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) const {
+void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
  fpga::ComputeFpgaPool(param.FpgaArgs());
 }
 }  // namespace operators

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -47,8 +47,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
 }
 template <>
-void SoftmaxKernel<FPGA, float>::Compute(
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
-    const SoftmaxParam<FPGA> &param) const {
  Tensor *in_x = param.FloatInput();
  Tensor *out = param.Out();

--- a/src/operators/kernel/fusion_fc_kernel.h
+++ b/src/operators/kernel/fusion_fc_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class FusionFcKernel
    : public framework::OpKernelBase<DeviceType, FusionFcParam<DeviceType>> {
 public:
-  void Compute(const FusionFcParam<DeviceType>& param) const;
+  void Compute(const FusionFcParam<DeviceType>& param);
  bool Init(FusionFcParam<DeviceType>* param);
 };

--- a/src/operators/kernel/gru_kernel.h
+++ b/src/operators/kernel/gru_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class GruKernel
    : public framework::OpKernelBase<DeviceType, GruParam<DeviceType>> {
 public:
-  void Compute(const GruParam<DeviceType>& param) const;
+  void Compute(const GruParam<DeviceType>& param);
  bool Init(GruParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/im2sequence_kernel.h
+++ b/src/operators/kernel/im2sequence_kernel.h
@@ -29,7 +29,7 @@ template <typename DeviceType, typename T>
 class Im2SequenceKernel
    : public framework::OpKernelBase<DeviceType, Im2SequenceParam<DeviceType>> {
 public:
-  void Compute(const Im2SequenceParam<DeviceType>& param) const;
+  void Compute(const Im2SequenceParam<DeviceType>& param);
  bool Init(Im2SequenceParam<DeviceType>* para);
 };
 }  // namespace operators

--- a/src/operators/kernel/lookup_kernel.h
+++ b/src/operators/kernel/lookup_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class LookupKernel
    : public framework::OpKernelBase<DeviceType, LookupParam<DeviceType>> {
 public:
-  void Compute(const LookupParam<DeviceType>& param) const;
+  void Compute(const LookupParam<DeviceType>& param);
  bool Init(LookupParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#pragma once
 #ifdef LRN_OP
 #ifdef _OPENMP
 #include <omp.h>
@@ -173,7 +175,7 @@ template <typename DeviceType, typename T>
 class LrnKernel
    : public framework::OpKernelBase<DeviceType, LrnParam<DeviceType>> {
 public:
-  void Compute(const LrnParam<DeviceType> &param) const;
+  void Compute(const LrnParam<DeviceType> &param);
  bool Init(LrnParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
@@ -145,7 +145,7 @@ bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam<GPU_MALI>* param) {
 template <>
 void BatchNormKernel<GPU_MALI, float>::Compute(
-    const BatchNormParam<GPU_MALI>& param) const {
+    const BatchNormParam<GPU_MALI>& param) {
  std::cout << "init acl" << std::endl;
  AclBatchNormOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
@@ -118,7 +118,7 @@ bool ConcatKernel<GPU_MALI, float>::Init(ConcatParam<GPU_MALI>* param) {
 template <>
 void ConcatKernel<GPU_MALI, float>::Compute(
-    const ConcatParam<GPU_MALI>& param) const {
+    const ConcatParam<GPU_MALI>& param) {
  std::cout << "init acl" << std::endl;
  AclConcatOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
@@ -212,7 +212,7 @@ bool ConvAddKernel<GPU_MALI, float>::Init(FusionConvAddParam<GPU_MALI>* param) {
 template <>
 void ConvAddKernel<GPU_MALI, float>::Compute(
-    const FusionConvAddParam<GPU_MALI>& param) const {
+    const FusionConvAddParam<GPU_MALI>& param) {
  std::cout << "init acl" << std::endl;
  AclConvAddOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -211,8 +211,7 @@ bool ConvKernel<GPU_MALI, float>::Init(ConvParam<GPU_MALI>* param) {
 }
 template <>
-void ConvKernel<GPU_MALI, float>::Compute(
+void ConvKernel<GPU_MALI, float>::Compute(const ConvParam<GPU_MALI>& param) {
-    const ConvParam<GPU_MALI>& param) const {
  std::cout << "init acl" << std::endl;
  AclConvOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
@@ -34,7 +34,7 @@ bool ElementwiseAddKernel<GPU_MALI, float>::Init(
 template <>
 void ElementwiseAddKernel<GPU_MALI, float>::Compute(
-    const ElementwiseAddParam<GPU_MALI> &param) const {
+    const ElementwiseAddParam<GPU_MALI> &param) {
  const Tensor *input_x = param.InputX();
  const Tensor *input_y = param.InputY();
  Tensor *Out = param.Out();

--- a/src/operators/kernel/mali/feed_kernel.cpp
+++ b/src/operators/kernel/mali/feed_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/feed_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Mali GPU specialization of the feed kernel. This file opens no
+// #if/#ifdef block, so it must not end with an #endif.
+//
+// Init: no per-kernel setup is performed; always reports success.
+template <>
+bool FeedKernel<GPU_MALI, float>::Init(FeedParam<GPU_MALI> *param) {
+  return true;
+}
+// Compute: hands the fed input to the output through ShareDataWith and then
+// copies the input's LoD onto the output.
+template <>
+void FeedKernel<GPU_MALI, float>::Compute(const FeedParam<GPU_MALI> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+  param.Out()->set_lod(param.InputX()->lod());
+}
+template class FeedKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/mali/fetch_kernel.cpp
+++ b/src/operators/kernel/mali/fetch_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/kernel/fetch_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Mali GPU specialization of the fetch kernel. It is intentionally not
+// wrapped in `#ifdef FUSION_CONVADD_OP`: nothing here uses the fused conv+add
+// op, and under that guard the explicit instantiation below would vanish
+// from any build that leaves FUSION_CONVADD_OP undefined.
+//
+// Init: no per-kernel setup is performed; always reports success.
+template <>
+bool FetchKernel<GPU_MALI, float>::Init(FetchParam<GPU_MALI> *param) {
+  return true;
+}
+// Compute: hands the op input to the output through ShareDataWith
+// (presumably aliasing the buffer rather than copying it -- see Tensor).
+template <>
+void FetchKernel<GPU_MALI, float>::Compute(const FetchParam<GPU_MALI> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+}
+template class FetchKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
@@ -26,7 +26,7 @@ bool FusionFcKernel<GPU_MALI, float>::Init(FusionFcParam<GPU_MALI> *param) {
 template <>
 void FusionFcKernel<GPU_MALI, float>::Compute(
-    const FusionFcParam<GPU_MALI> &param) const {
+    const FusionFcParam<GPU_MALI> &param) {
  const Tensor *input_x = param.InputX();
  const Tensor *input_y = param.InputY();
  const Tensor *input_z = param.InputZ();

--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
@@ -127,8 +127,7 @@ bool LrnKernel<GPU_MALI, float>::Init(LrnParam<GPU_MALI>* param) {
 }
 template <>
-void LrnKernel<GPU_MALI, float>::Compute(
+void LrnKernel<GPU_MALI, float>::Compute(const LrnParam<GPU_MALI>& param) {
-    const LrnParam<GPU_MALI>& param) const {
  std::cout << "init acl" << std::endl;
  AclLrnOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
@@ -27,8 +27,7 @@ bool MulKernel<GPU_MALI, float>::Init(MulParam<GPU_MALI> *param) {
 }
 template <>
-void MulKernel<GPU_MALI, float>::Compute(
+void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) {
-    const MulParam<GPU_MALI> &param) const {
  const Tensor *input_x = param.InputX();
  const Tensor *input_y = param.InputY();
  Tensor *out = param.Out();

--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
@@ -195,8 +195,7 @@ bool PoolKernel<GPU_MALI, float>::Init(PoolParam<GPU_MALI>* param) {
 }
 template <>
-void PoolKernel<GPU_MALI, float>::Compute(
+void PoolKernel<GPU_MALI, float>::Compute(const PoolParam<GPU_MALI>& param) {
-    const PoolParam<GPU_MALI>& param) const {
  std::cout << "init acl" << std::endl;
  AclPoolOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
@@ -115,8 +115,7 @@ bool ReluKernel<GPU_MALI, float>::Init(ReluParam<GPU_MALI>* param) {
 }
 template <>
-void ReluKernel<GPU_MALI, float>::Compute(
+void ReluKernel<GPU_MALI, float>::Compute(const ReluParam<GPU_MALI>& param) {
-    const ReluParam<GPU_MALI>& param) const {
  std::cout << "init acl" << std::endl;
  AclReluOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
@@ -28,7 +28,7 @@ bool ReshapeKernel<GPU_MALI, float>::Init(ReshapeParam<GPU_MALI> *param) {
 template <>
 void ReshapeKernel<GPU_MALI, float>::Compute(
-    const ReshapeParam<GPU_MALI> &param) const {
+    const ReshapeParam<GPU_MALI> &param) {
  const auto *input_x = param.InputX();
  const auto &input_x_dims = input_x->dims();
  auto *out = param.Out();

--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
@@ -113,7 +113,7 @@ bool SoftmaxKernel<GPU_MALI, float>::Init(SoftmaxParam<GPU_MALI>* param) {
 template <>
 void SoftmaxKernel<GPU_MALI, float>::Compute(
-    const SoftmaxParam<GPU_MALI>& param) const {
+    const SoftmaxParam<GPU_MALI>& param) {
  std::cout << "init acl" << std::endl;
  AclSoftmaxOp<GPU_MALI, float>* acl_op =
      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());

--- a/src/operators/kernel/mul_kernel.h
+++ b/src/operators/kernel/mul_kernel.h
@@ -29,7 +29,7 @@ template <typename DeviceType, typename T>
 class MulKernel
    : public framework::OpKernelBase<DeviceType, MulParam<DeviceType>> {
 public:
-  void Compute(const MulParam<DeviceType> &param) const;
+  void Compute(const MulParam<DeviceType> &param);
  bool Init(MulParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/multiclass_nms_kernel.h
+++ b/src/operators/kernel/multiclass_nms_kernel.h
@@ -28,7 +28,7 @@ class MultiClassNMSKernel
    : public framework::OpKernelBase<DeviceType,
                                     MultiClassNMSParam<DeviceType>> {
 public:
-  void Compute(const MultiClassNMSParam<DeviceType>& param) const;
+  void Compute(const MultiClassNMSParam<DeviceType>& param);
  bool Init(MultiClassNMSParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/polygon_box_transform_kernel.h
+++ b/src/operators/kernel/polygon_box_transform_kernel.h
@@ -27,7 +27,7 @@ class PolygonBoxTransformKernel
    : public framework::OpKernelBase<DeviceType,
                                     PolygonBoxTransformParam<DeviceType>> {
 public:
-  void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;
+  void Compute(const PolygonBoxTransformParam<DeviceType>& param);
  bool Init(PolygonBoxTransformParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/pool_kernel.h
+++ b/src/operators/kernel/pool_kernel.h
@@ -26,7 +26,7 @@ using framework::OpKernelBase;
 template <typename DeviceType, typename T>
 class PoolKernel : public OpKernelBase<DeviceType, PoolParam<DeviceType>> {
 public:
-  void Compute(const PoolParam<DeviceType> &param) const override;
+  void Compute(const PoolParam<DeviceType> &param);
  bool Init(PoolParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/prelu_kernel.h
+++ b/src/operators/kernel/prelu_kernel.h
@@ -24,7 +24,7 @@ template <typename DeviceType, typename T>
 class PReluKernel
    : public framework::OpKernelBase<DeviceType, PReluParam<DeviceType>> {
 public:
-  void Compute(const PReluParam<DeviceType>& param) const;
+  void Compute(const PReluParam<DeviceType>& param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/prior_box_kernel.h
+++ b/src/operators/kernel/prior_box_kernel.h
@@ -54,7 +54,7 @@ template <typename DeviceType, typename T>
 class PriorBoxKernel
    : public framework::OpKernelBase<DeviceType, PriorBoxParam<DeviceType>> {
 public:
-  void Compute(const PriorBoxParam<DeviceType>& param) const;
+  void Compute(const PriorBoxParam<DeviceType>& param);
  bool Init(PriorBoxParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/quantize_kernel.h
+++ b/src/operators/kernel/quantize_kernel.h
@@ -26,7 +26,7 @@ template <typename DeviceType, typename T>
 class QuantizeKernel
    : public framework::OpKernelBase<DeviceType, QuantizeParam<DeviceType>> {
 public:
-  void Compute(const QuantizeParam<DeviceType> &param) const;
+  void Compute(const QuantizeParam<DeviceType> &param);
  bool Init(QuantizeParam<DeviceType> *param);
 };

--- a/src/operators/kernel/relu_kernel.h
+++ b/src/operators/kernel/relu_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class ReluKernel
    : public framework::OpKernelBase<DeviceType, ReluParam<DeviceType>> {
 public:
-  void Compute(const ReluParam<DeviceType>& param) const;
+  void Compute(const ReluParam<DeviceType>& param);
  bool Init(ReluParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/reshape2_kernel.h
+++ b/src/operators/kernel/reshape2_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class Reshape2Kernel
    : public framework::OpKernelBase<DeviceType, Reshape2Param<DeviceType>> {
 public:
-  void Compute(const Reshape2Param<DeviceType>& param) const;
+  void Compute(const Reshape2Param<DeviceType>& param);
  bool Init(Reshape2Param<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/reshape_kernel.h
+++ b/src/operators/kernel/reshape_kernel.h
@@ -71,7 +71,7 @@ template <typename DeviceType, typename T>
 class ReshapeKernel
    : public framework::OpKernelBase<DeviceType, ReshapeParam<DeviceType>> {
 public:
-  void Compute(const ReshapeParam<DeviceType>& param) const;
+  void Compute(const ReshapeParam<DeviceType>& param);
  bool Init(ReshapeParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/resize_kernel.h
+++ b/src/operators/kernel/resize_kernel.h
@@ -74,7 +74,7 @@ template <typename DeviceType, typename T>
 class ResizeKernel
    : public framework::OpKernelBase<DeviceType, ResizeParam<DeviceType>> {
 public:
-  void Compute(const ResizeParam<DeviceType> &param) const;
+  void Compute(const ResizeParam<DeviceType> &param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/scale_kernel.h
+++ b/src/operators/kernel/scale_kernel.h
@@ -24,7 +24,7 @@ template <typename DeviceType, typename T>
 class ScaleKernel
    : public framework::OpKernelBase<DeviceType, ScaleParam<DeviceType>> {
 public:
-  void Compute(const ScaleParam<DeviceType>& param) const;
+  void Compute(const ScaleParam<DeviceType>& param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/shape_kernel.h
+++ b/src/operators/kernel/shape_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class ShapeKernel
    : public framework::OpKernelBase<DeviceType, ShapeParam<DeviceType>> {
 public:
-  void Compute(const ShapeParam<DeviceType>& param) const;
+  void Compute(const ShapeParam<DeviceType>& param);
  bool Init(ShapeParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/sigmoid_kernel.h
+++ b/src/operators/kernel/sigmoid_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class SigmoidKernel
    : public OpKernelBase<DeviceType, SigmoidParam<DeviceType>> {
 public:
-  void Compute(const SigmoidParam<DeviceType>& param) const override;
+  void Compute(const SigmoidParam<DeviceType>& param);
  bool Init(SigmoidParam<DeviceType>* param);
 };

--- a/src/operators/kernel/slice_kernel.h
+++ b/src/operators/kernel/slice_kernel.h
@@ -24,7 +24,7 @@ template <typename DeviceType, typename T>
 class SliceKernel
    : public framework::OpKernelBase<DeviceType, SliceParam<DeviceType>> {
 public:
-  void Compute(const SliceParam<DeviceType>& param) const {}
+  void Compute(const SliceParam<DeviceType>& param) {}
 };
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/softmax_kernel.h
+++ b/src/operators/kernel/softmax_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class SoftmaxKernel
    : public OpKernelBase<DeviceType, SoftmaxParam<DeviceType>> {
 public:
-  void Compute(const SoftmaxParam<DeviceType> &param) const override;
+  void Compute(const SoftmaxParam<DeviceType> &param);
  bool Init(SoftmaxParam<DeviceType> *param);
 };
 }  // namespace operators

--- a/src/operators/kernel/split_kernel.h
+++ b/src/operators/kernel/split_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class SplitKernel
    : public framework::OpKernelBase<DeviceType, SplitParam<DeviceType>> {
 public:
-  void Compute(const SplitParam<DeviceType>& param) const;
+  void Compute(const SplitParam<DeviceType>& param);
  bool Init(SplitParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/sum_kernel.h
+++ b/src/operators/kernel/sum_kernel.h
@@ -25,7 +25,7 @@ template <typename DeviceType, typename T>
 class SumKernel
    : public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
 public:
-  void Compute(const SumParam<DeviceType> &param) const;
+  void Compute(const SumParam<DeviceType> &param);
  bool Init(SumParam<DeviceType> *param);
 };

--- a/src/operators/kernel/transpose2_kernel.h
+++ b/src/operators/kernel/transpose2_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class Transpose2Kernel
    : public framework::OpKernelBase<DeviceType, Transpose2Param<DeviceType>> {
 public:
-  void Compute(const Transpose2Param<DeviceType>& param) const;
+  void Compute(const Transpose2Param<DeviceType>& param);
  bool Init(Transpose2Param<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/kernel/transpose_kernel.h
+++ b/src/operators/kernel/transpose_kernel.h
@@ -28,7 +28,7 @@ template <typename DeviceType, typename T>
 class TransposeKernel
    : public framework::OpKernelBase<DeviceType, TransposeParam<DeviceType>> {
 public:
-  void Compute(const TransposeParam<DeviceType>& param) const;
+  void Compute(const TransposeParam<DeviceType>& param);
  bool Init(TransposeParam<DeviceType>* param);
 };
 }  // namespace operators

--- a/src/operators/lookup_op.h
+++ b/src/operators/lookup_op.h
@@ -37,10 +37,6 @@ class LookupOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, LookupParam<DeviceType>,
                                      operators::LookupKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, LookupParam<DeviceType>,
-      operators::LookupKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/lrn_op.h
+++ b/src/operators/lrn_op.h
@@ -35,10 +35,6 @@ class LrnOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, LrnParam<DeviceType>,
                                      operators::LrnKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, LrnParam<DeviceType>,
-      operators::LrnKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/mul_op.h
+++ b/src/operators/mul_op.h
@@ -35,10 +35,6 @@ class MulOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, MulParam<DeviceType>,
                                      operators::MulKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, MulParam<DeviceType>,
-      operators::MulKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/multiclass_nms_op.h
+++ b/src/operators/multiclass_nms_op.h
@@ -40,10 +40,6 @@ class MultiClassNMSOp : public framework::OperatorWithKernel<
            DeviceType, MultiClassNMSParam<DeviceType>,
            operators::MultiClassNMSKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, MultiClassNMSParam<DeviceType>,
-      operators::MultiClassNMSKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -27,6 +27,10 @@ limitations under the License. */
 #include "fpga/api.h"
 #endif
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_image.h"
+#endif
 namespace paddle_mobile {
 namespace operators {
@@ -48,6 +52,17 @@ struct DtypeTensorTrait {
  typedef framework::Tensor rtype;
 };
+#ifdef PADDLE_MOBILE_CL
+// Trait specialization for the OpenCL device: both aliases resolve to
+// framework::CLImage.
+template <>
+struct DtypeTensorTrait<GPU_CL> {
+  // gtype: the type obtained from a variable.
+  using gtype = framework::CLImage;
+  // rtype: the parent class type of gtype, or the same type.
+  using rtype = framework::CLImage;
+};
+#endif
 class OpParam {
 protected:
  template <typename T>
@@ -397,6 +412,13 @@ class ConvParam : public OpParam {
  const int &Groups() const { return groups; }
+#ifdef PADDLE_MOBILE_CL
+  // Returns the offset most recently stored through SetOffset().
+  int Offset() const { return offset_; }
+  // Stores in_offset and returns the stored value. The function is declared
+  // to return int, so it must not flow off its end without a return
+  // statement (that is undefined behavior).
+  int SetOffset(int in_offset) {
+    offset_ = in_offset;
+    return offset_;
+  }
+#endif
 private:
  RType *input_;
  RType *output_;
@@ -405,6 +427,10 @@ class ConvParam : public OpParam {
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
+#ifdef PADDLE_MOBILE_CL
+  // Value exposed through Offset()/SetOffset(). Zero-initialized so that
+  // Offset() never reads an indeterminate value if SetOffset() was not
+  // called first.
+  int offset_ = 0;
+#endif
 };
 template <typename Dtype>
 Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
@@ -715,6 +741,14 @@ class BatchNormParam : OpParam {
  const string &DataFormat() const { return data_format_; }
+  // Setters: store the caller-supplied pointer as-is; nothing is copied.
+  // NOTE(review): presumably these hold a precomputed (folded) scale/bias,
+  // as the same-named members of FusionConvAddBNReluParam do -- confirm.
+  void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
+  void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
+  // Getters: read-only view of the pointers stored by the setters.
+  const RType *NewScale() const { return new_scale_; }
+  const RType *NewBias() const { return new_bias_; }
 private:
  RType *input_x_;
  RType *output_y_;
@@ -726,6 +760,8 @@ class BatchNormParam : OpParam {
  float momentum_;
  bool is_test_;
  string data_format_;
+  // Assigned through SetNewBias()/SetNewScale(). Null until a setter runs,
+  // so NewBias()/NewScale() never hand out an indeterminate pointer.
+  RType *new_bias_ = nullptr;
+  RType *new_scale_ = nullptr;
 };
 #endif
@@ -1034,18 +1070,18 @@ class FeedParam : public OpParam {
 public:
  FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope *scope) {
+            const AttributeMap &attrs, const Scope &scope) {
-    input_x_ = InputXFrom<GType>(inputs, *scope);
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<GType>(outputs, *scope);
+    out_ = OutFrom<GType>(outputs, scope);
-    auto var = scope->Var("batch_size");
+    // NOTE(review): var is dereferenced unchecked on the next line. If
+    // FindVar can return null when "batch_size" is absent (presumably it
+    // can, unlike the Var() call it replaces), this crashes -- confirm
+    // against Scope.
+    auto var = scope.FindVar("batch_size");
    batch_size = var->GetValue<int>();
  }
-  const GType *InputX() const { return input_x_; }
+  const LoDTensor *InputX() const { return input_x_; }
  GType *Out() const { return out_; }
  const int BatchSize() const { return batch_size; }
 private:
-  GType *input_x_;
+  LoDTensor *input_x_;
  GType *out_;
  int batch_size;
 };
@@ -1059,14 +1095,19 @@ class FetchParam : public OpParam {
  FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
             const AttributeMap &attrs, const Scope &scope) {
    input_x_ = InputXFrom<GType>(inputs, scope);
-    out_ = OutFrom<GType>(outputs, scope);
+    out_ = OutFrom(outputs, scope);
  }
  const RType *InputX() const { return input_x_; }
-  RType *Out() const { return out_; }
+  Tensor *Out() const { return out_; }
+  // Looks up the "Out" variable as a LoDTensor and returns it as a Tensor*.
+  // Unlike the OutFrom<T>(outputs, scope) template used by the other params
+  // in this file, this overload takes no template argument, so the fetch
+  // output is read as a LoDTensor regardless of the device's GType.
+  static Tensor *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
+    return GetVarValue<LoDTensor>("Out", outputs, scope);
+  }
 private:
  RType *input_x_;
-  RType *out_;
+  Tensor *out_;
 };
 #ifdef FILL_CONSTANT_OP
@@ -1447,13 +1488,13 @@ class ResizeParam : public OpParam {
 * @b op 层实例化好这个 param 传递给 kernel 层使用
 * */
 template <typename Dtype>
-class ReluParam : public OpParam {
+class ReluParamBase : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
 public:
-  ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+  ReluParamBase(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, const Scope &scope) {
+                const AttributeMap &attrs, const Scope &scope) {
    input_x_ = InputXFrom<GType>(inputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
  }
@@ -1466,6 +1507,25 @@ class ReluParam : public OpParam {
  RType *input_x_;
  RType *out_;
 };
+// Default ReluParam: adds nothing to ReluParamBase<Dtype> and only inherits
+// its constructors.
+template <typename Dtype>
+class ReluParam : public ReluParamBase<Dtype> {
+  typedef ReluParamBase<Dtype> Base;
+ public:
+  using Base::Base;
+};
+#ifdef PADDLE_MOBILE_CL
+// OpenCL specialization of ReluParam: inherits the constructors of
+// ReluParamBase<GPU_CL> and additionally owns one CLImage.
+template <>
+class ReluParam<GPU_CL> : public ReluParamBase<GPU_CL> {
+  typedef ReluParamBase<GPU_CL> Base;
+ public:
+  using Base::Base;
+  // Returns a mutable reference to the image owned by this param.
+  framework::CLImage &getMidImage() { return mid_image_; }
+ private:
+  // Image owned by this param; exposed through getMidImage().
+  framework::CLImage mid_image_;
+};
+#endif
 #endif
 #ifdef PRELU_OP
@@ -1764,6 +1824,7 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
  bool is_test_;
  RType *new_bias_;
  RType *new_scale_;
 #ifdef PADDLE_MOBILE_FPGA
 private:

--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -14,7 +14,8 @@ limitations under the License. */
 #ifdef POOL_OP
-#include "pool_op.h"
+#include <vector>
+#include "operators/pool_op.h"
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
@@ -68,5 +69,8 @@ REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(pool2d, ops::PoolOp);
+#endif
 #endif
--- a/src/operators/pool_op.h
+++ b/src/operators/pool_op.h
@@ -38,9 +38,6 @@ class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
      : OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
                           operators::PoolKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using OperatorWithKernel<
-      DeviceType, PoolParam<DeviceType>,
-      operators::PoolKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 private:

--- a/src/operators/prelu_op.h
+++ b/src/operators/prelu_op.h
@@ -38,10 +38,6 @@ class PReluOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, PReluParam<DeviceType>,
                                      operators::PReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, PReluParam<DeviceType>,
-      operators::PReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/prior_box_op.h
+++ b/src/operators/prior_box_op.h
@@ -40,9 +40,6 @@ class PriorBoxOp : public framework::OperatorWithKernel<
                                      operators::PriorBoxKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, PriorBoxParam<DeviceType>,
-      operators::PriorBoxKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -39,5 +39,10 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
 #endif
+// Register relu for the OpenCL backend. (No FPGA registration is made for
+// relu in this file, so no empty PADDLE_MOBILE_FPGA block is kept.)
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(relu, ops::ReluOp);
+#endif
 #endif
--- a/src/operators/relu_op.h
+++ b/src/operators/relu_op.h
@@ -41,10 +41,6 @@ class ReluOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ReluParam<DeviceType>,
                                      operators::ReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ReluParam<DeviceType>,
-      operators::ReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -38,5 +38,8 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp);
+#endif
 #endif
--- a/src/operators/reshape_op.h
+++ b/src/operators/reshape_op.h
@@ -39,10 +39,6 @@ class ReshapeOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ReshapeParam<DeviceType>,
                                      operators::ReshapeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ReshapeParam<DeviceType>,
-      operators::ReshapeKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/resize_op.h
+++ b/src/operators/resize_op.h
@@ -38,10 +38,6 @@ class ResizeOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ResizeParam<DeviceType>,
                                      operators::ResizeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ResizeParam<DeviceType>,
-      operators::ResizeKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/scale_op.h
+++ b/src/operators/scale_op.h
@@ -38,10 +38,6 @@ class ScaleOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ScaleParam<DeviceType>,
                                      operators::ScaleKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ScaleParam<DeviceType>,
-      operators::ScaleKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/shape_op.h
+++ b/src/operators/shape_op.h
@@ -38,10 +38,6 @@ class ShapeOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, ShapeParam<DeviceType>,
                                      operators::ShapeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, ShapeParam<DeviceType>,
-      operators::ShapeKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/sigmoid_op.h
+++ b/src/operators/sigmoid_op.h
@@ -36,11 +36,6 @@ class SigmoidOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, SigmoidParam<DeviceType>,
                                      operators::SigmoidKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, SigmoidParam<DeviceType>,
-      operators::SigmoidKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/src/operators/slice_op.h
+++ b/src/operators/slice_op.h
@@ -38,10 +38,6 @@ class SliceOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, SliceParam<DeviceType>,
                                      operators::SliceKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, SliceParam<DeviceType>,
-      operators::SliceKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:

--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -36,5 +36,8 @@ REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(softmax, ops::SoftmaxOp);
+#endif
 #endif
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -36,11 +36,6 @@ class SoftmaxOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, SoftmaxParam<DeviceType>,
                                      operators::SoftmaxKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, SoftmaxParam<DeviceType>,
-      operators::SoftmaxKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 private:

--- a/src/operators/split_op.h
+++ b/src/operators/split_op.h
@@ -38,10 +38,6 @@ class SplitOp : public framework::OperatorWithKernel<
      : framework::OperatorWithKernel<DeviceType, SplitParam<DeviceType>,
                                      operators::SplitKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, SplitParam<DeviceType>,
-      operators::SplitKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };
 }  // namespace operators

--- a/src/operators/transpose_op.h
+++ b/src/operators/transpose_op.h
@@ -40,10 +40,6 @@ class TransposeOp : public framework::OperatorWithKernel<
            DeviceType, TransposeParam<DeviceType>,
            operators::TransposeKernel<DeviceType, T>>(type, inputs, outputs,
                                                       attrs, scope) {}
-  using framework::OperatorWithKernel<
-      DeviceType, TransposeParam<DeviceType>,
-      operators::TransposeKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 };

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -342,6 +342,13 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
    target_link_libraries(test-fssd paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-mobilenetgpu  net/test_mobilenet_GPU.cpp test_helper.h  test_include.h)
+    target_link_libraries(test-mobilenetgpu paddle-mobile)
+    # gen test
+    ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-yologpu paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
@@ -351,6 +358,5 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
    target_link_libraries(test-eng paddle-mobile)
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <vector>
 #include "common/log.h"
+#include "framework/executor.h"
 #include "framework/op_registry.h"
-#include "io/executor.h"
 #include "operators/conv_op.h"
 #include "operators/elementwise_add_op.h"
 #include "operators/pool_op.h"
@@ -29,9 +29,9 @@ limitations under the License. */
 #include "operators/softmax_op.h"
 #include "operators/transpose_op.h"
-using paddle_mobile::Executor;
 using paddle_mobile::framework::BlockDesc;
 using paddle_mobile::framework::DDim;
+using paddle_mobile::framework::Executor;
 using paddle_mobile::framework::LoDTensor;
 using paddle_mobile::framework::OpDesc;
 using paddle_mobile::framework::Program;

--- a/test/fpga/test_concat_op.cpp
+++ b/test/fpga/test_concat_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/concat_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::FPGA> loader;
  auto program = loader.Load(g_googlenet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <iostream>
 #include <string>
 #include "../test_helper.h"
-#include "io/loader.h"
+#include "framework/loader.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
-  //  auto program = loader.Load(g_googlenet, true);
  //  auto program = loader.Load(g_mobilenet_ssd, true);
-  auto program = loader.Load(std::string(g_ocr) + "/model",
+  //  auto program = loader.Load(std::string(g_ocr) + "/model",
-                             std::string(g_ocr) + "/params", false);
+  //                             std::string(g_ocr) + "/params", false);
  //  program.originProgram->Description("program desc: ");
  return 0;
 }
--- a/test/framework/test_optimize.cpp
+++ b/test/framework/test_optimize.cpp
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "../test_helper.h"
+#include "framework/loader.h"
 #include "framework/program/program-optimize/node.h"
 #include "framework/program/program-optimize/program_optimize.h"
-#include "io/loader.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  //    "../../../test/models/googlenet"
  auto program = loader.Load(g_mobilenet_ssd, true);
  paddle_mobile::framework::ProgramOptimize optimize;

--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -29,8 +29,9 @@ int main() {
  bool optimize = true;
  auto time1 = time();
  if (paddle_mobile.Load(g_googlenet, optimize)) {
-    auto time2 = time();
+    auto time2 = paddle_mobile::time();
-    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
+              << std::endl;
    std::vector<float> input;
    std::vector<float> output;
    std::vector<int64_t> dims{1, 3, 224, 224};

--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -19,14 +19,15 @@ limitations under the License. */
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
-  auto time1 = time();
+  auto time1 = paddle_mobile::time();
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);
  auto isok = paddle_mobile.Load(g_mobilenet, true);
  if (isok) {
-    auto time2 = time();
+    auto time2 = paddle_mobile::time();
-    std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
+              << std::endl;
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
@@ -42,14 +43,14 @@ int main() {
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
-    auto time3 = time();
+    auto time3 = paddle_mobile::time();
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
    DLOG << vec_result;
-    auto time4 = time();
+    auto time4 = paddle_mobile::time();
-    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+    std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10
-              << std::endl;
+              << "ms" << std::endl;
  }
  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "

--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+// Smoke test for the GPU_CL backend: loads the model at g_mobilenet, runs one
+// untimed prediction followed by a timed batch on the banana test image, and
+// prints the load time, the mean time per prediction and the largest output
+// value together with its index. Always returns 0.
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
+  auto time1 = paddle_mobile::time();
+  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
+  if (isok) {
+    auto time2 = paddle_mobile::time();
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
+              << std::endl;
+
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+
+    // First prediction happens before time3, so it is excluded from the
+    // reported average.
+    std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
+
+    auto time3 = paddle_mobile::time();
+    int max = 10;
+    for (int i = 0; i < max; ++i) {
+      vec_result = paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = paddle_mobile::time();
+    std::cout << "predict cost :"
+              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+              << std::endl;
+
+    // std::max_element returns the end iterator for an empty range, and
+    // dereferencing it is undefined behaviour, so only report the maximum
+    // when Predict actually produced output.
+    if (vec_result.empty()) {
+      std::cout << " Predict returned an empty result" << std::endl;
+    } else {
+      std::vector<float>::iterator biggest =
+          std::max_element(std::begin(vec_result), std::end(vec_result));
+      std::cout << " Max element is " << *biggest << " at position "
+                << std::distance(std::begin(vec_result), biggest) << std::endl;
+    }
+  }
+
+  // Printed unconditionally. The message reads: "If the result is NaN, check
+  // whether test/images/g_test_image_1x3x224x224_banana exists."
+  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
+               "是否存在?"
+            << std::endl;
+  return 0;
+}
--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+// Smoke test for the GPU_CL backend: loads the model at g_yolo_mul, runs a
+// timed batch of predictions on the 1x3x416x416 input read from g_yolo_img,
+// and prints the load time, the mean time per prediction and the largest
+// output value together with its index. Always returns 0.
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
+  auto time1 = paddle_mobile::time();
+  auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
+  if (isok) {
+    auto time2 = paddle_mobile::time();
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
+              << std::endl;
+
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 416, 416};
+    GetInput<float>(g_yolo_img, &input, dims);
+
+    // No prediction is run before time3, so the very first Predict call is
+    // included in the reported average.
+    std::vector<float> vec_result;
+    auto time3 = paddle_mobile::time();
+    int max = 10;
+    for (int i = 0; i < max; ++i) {
+      vec_result = paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = paddle_mobile::time();
+    std::cout << "predict cost :"
+              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+              << std::endl;
+
+    // std::max_element returns the end iterator for an empty range, and
+    // dereferencing it is undefined behaviour, so only report the maximum
+    // when Predict actually produced output.
+    if (vec_result.empty()) {
+      std::cout << " Predict returned an empty result" << std::endl;
+    } else {
+      std::vector<float>::iterator biggest =
+          std::max_element(std::begin(vec_result), std::end(vec_result));
+      std::cout << " Max element is " << *biggest << " at position "
+                << std::distance(std::begin(vec_result), biggest) << std::endl;
+    }
+  }
+  return 0;
+}
--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -125,7 +125,7 @@ template class TestBatchNormOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run BatchNormOp Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  /// input x (4,10,2,2)

--- a/test/operators/test_box_coder_op.cpp
+++ b/test/operators/test_box_coder_op.cpp
@@ -114,7 +114,7 @@ template class TestBoxCoderOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run BoxCoderOp Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  paddle_mobile::framework::Tensor priorbox;

--- a/test/operators/test_concat_op.cpp
+++ b/test/operators/test_concat_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/concat_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_googlenet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/operators/test_conv_add_relu_op.cpp
+++ b/test/operators/test_conv_add_relu_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/fusion_conv_add_relu_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  //  ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_googlenet, true);

--- a/test/operators/test_cov_op.cpp
+++ b/test/operators/test_cov_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/conv_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::GPU_MALI> loader;
  //  ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_googlenet);

--- a/test/operators/test_depthwise_conv_op.cpp
+++ b/test/operators/test_depthwise_conv_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/depthwise_conv_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  //  ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_mobilenet_ssd);

--- a/test/operators/test_elementwise_add_op.cpp
+++ b/test/operators/test_elementwise_add_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "../test_include.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_resnet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/operators/test_elementwise_sub_op.cpp
+++ b/test/operators/test_elementwise_sub_op.cpp
@@ -104,7 +104,7 @@ template class TestElementwiseSubOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run ElementwiseSub Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_ocr) + "/model",
                             std::string(g_ocr) + "/params");

--- a/test/operators/test_fill_constant_op.cpp
+++ b/test/operators/test_fill_constant_op.cpp
@@ -94,7 +94,7 @@ template class TestFillConstantOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run FillConstant Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_ocr) + "/model",
                             std::string(g_ocr) + "/params");

--- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp
+++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "operators/fusion_conv_add_bn_relu_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  //  ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_mobilenet, true);

--- a/test/operators/test_fusion_fc_op.cpp
+++ b/test/operators/test_fusion_fc_op.cpp
@@ -112,7 +112,7 @@ template class TestFcOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run Fc Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  //    "../../../test/models/googlenet"
  auto program = loader.Load(g_googlenet);
  paddle_mobile::framework::ProgramOptimize optimize;

--- a/test/operators/test_gru_op.cpp
+++ b/test/operators/test_gru_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/gru_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_nlp);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
@@ -60,7 +60,6 @@ class TestIm2SequenceOp {
    Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1");
    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
    tensor_x1->ShareDataWith(t1);
    Variable *output = scope->Var("im2sequence_0.tmp_0");
    auto *output_tensor = output->GetMutable<LoDTensor>();
    output_tensor->mutable_data<float>({2, 12});
@@ -100,7 +99,7 @@ template class TestIm2SequenceOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run Im2Sequence Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_eng) + "/model",
                             std::string(g_eng) + "/params");

--- a/test/operators/test_lrn_op.cpp
+++ b/test/operators/test_lrn_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/lrn_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_googlenet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/operators/test_multiclass_nms_op.cpp
+++ b/test/operators/test_multiclass_nms_op.cpp
@@ -111,9 +111,8 @@ template class TestMultiClassNMSOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run MulticlassNMS Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  paddle_mobile::framework::Tensor inputx1;
  SetupTensor<float>(&inputx1, {1, 2, 4}, static_cast<float>(0),
                     static_cast<float>(1));

--- a/test/operators/test_polygon_box_transform_op.cpp
+++ b/test/operators/test_polygon_box_transform_op.cpp
@@ -96,7 +96,7 @@ template class TestPolygonBoxTransformOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run PolygonBoxTransform Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_ocr));
  paddle_mobile::framework::Tensor input;

--- a/test/operators/test_pool_op.cpp
+++ b/test/operators/test_pool_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/pool_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_googlenet));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";

--- a/test/operators/test_prelu_op.cpp
+++ b/test/operators/test_prelu_op.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "operators/prelu_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_resnet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/operators/test_prior_box_op.cpp
+++ b/test/operators/test_prior_box_op.cpp
@@ -125,7 +125,7 @@ template class TestPriorBoxOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run PriorBoxOp Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  /// input x (1,3,300,300)

--- a/test/operators/test_relu_op.cpp
+++ b/test/operators/test_relu_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/relu_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_resnet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

--- a/test/operators/test_reshape2_op.cpp
+++ b/test/operators/test_reshape2_op.cpp
@@ -112,7 +112,7 @@ template class TestReshape2Op<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run Reshape2 Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_ocr) + "/model",
                             std::string(g_ocr) + "/params");

--- a/test/operators/test_reshape_op.cpp
+++ b/test/operators/test_reshape_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/reshape_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";

--- a/test/operators/test_resize_op.cpp
+++ b/test/operators/test_resize_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/resize_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";

--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h"
 #include "../../src/operators/kernel/sigmoid_kernel.h"
 #include "../test_helper.h"
-#include "io/executor.h"
+#include "framework/executor.h"
 int main() {
  paddle_mobile::framework::Tensor input;

--- a/test/operators/test_softmax_op.cpp
+++ b/test/operators/test_softmax_op.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "operators/softmax_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";

--- a/test/operators/test_sum_op.cpp
+++ b/test/operators/test_sum_op.cpp
@@ -103,7 +103,7 @@ template class TestSumOp<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run Sum Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_eng) + "/model",
                             std::string(g_eng) + "/params");

--- a/test/operators/test_transpose2_op.cpp
+++ b/test/operators/test_transpose2_op.cpp
@@ -113,7 +113,7 @@ template class TestTranspose2Op<CPU>;
 int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run Transpose2 Test";
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_ocr) + "/model",
                             std::string(g_ocr) + "/params");

--- a/test/operators/test_transpose_op.cpp
+++ b/test/operators/test_transpose_op.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "../test_include.h"
 #include "operators/transpose_op.h"
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  if (program.originProgram == nullptr) {
    DLOG << "program read file";

--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -36,6 +36,7 @@ static const char *g_squeezenet = "../models/squeezenet";
 static const char *g_googlenet = "../models/googlenet";
 static const char *g_googlenet_quali = "../models/googlenet_combine_quali";
 static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_mobilenet_mul = "../models/mobilenet_mul";
 static const char *g_alexnet = "../models/alexnet";
 static const char *g_inceptionv4 = "../models/inceptionv4";
 static const char *g_nlp = "../models/nlp";
@@ -44,8 +45,8 @@ static const char *g_resnet = "../models/resnet";
 static const char *g_googlenet_combine = "../models/googlenet_combine";
 static const char *g_yolo = "../models/yolo";
 static const char *g_yolo_combined = "../models/yolo_combined";
+static const char *g_yolo_mul = "../models/yolo_mul";
 static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
 static const char *g_test_image_1x3x224x224 =
    "../images/test_image_1x3x224x224_float";
 static const char *g_test_image_1x3x224x224_banana =
@@ -57,9 +58,12 @@ static const char *g_moto = "../images/moto_300x300_float";
 static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
 static const char *g_imgfssd_ar1 = "../images/003_0001.txt";
 static const char *g_img = "../images/img.bin";
+static const char *g_yolo_img = "../images/in_put_1_3_416_416_2";  // relative path; the file name suggests a 1x3x416x416 input, TODO confirm
+static const char *g_mobilenet_img = "../images/image";  // relative path; presumably the input for the mobilenet tests, verify against callers
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
+using namespace paddle_mobile;  // NOTE(review): using-directive in a header; every file that includes test_helper.h gets all paddle_mobile names at global scope
 template <typename T>
 void SetupTensor(paddle_mobile::framework::Tensor *input,

--- a/third_party/opencl/OpenCL-Headers/CL/cl.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl.h
+/*******************************************************************************
+ * Copyright (c) 2008-2018 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+#ifdef __APPLE__
+#include <OpenCL/cl_version.h>
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_version.h>
+#include <CL/cl_platform.h>
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+/******************************************************************************/
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong            cl_bitfield;
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+#ifdef CL_VERSION_2_0
+typedef cl_bitfield         cl_device_svm_capabilities;
+#endif
+typedef cl_bitfield         cl_command_queue_properties;
+#ifdef CL_VERSION_1_2
+typedef intptr_t            cl_device_partition_property;
+typedef cl_bitfield         cl_device_affinity_domain;
+#endif
+typedef intptr_t            cl_context_properties;
+typedef cl_uint             cl_context_info;
+#ifdef CL_VERSION_2_0
+typedef cl_bitfield         cl_queue_properties;
+#endif
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+#ifdef CL_VERSION_2_0
+typedef cl_bitfield         cl_svm_mem_flags;
+#endif
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+#ifdef CL_VERSION_1_2
+typedef cl_bitfield         cl_mem_migration_flags;
+#endif
+typedef cl_uint             cl_image_info;
+#ifdef CL_VERSION_1_1
+typedef cl_uint             cl_buffer_create_type;
+#endif
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+#ifdef CL_VERSION_2_0
+typedef intptr_t            cl_pipe_properties;
+typedef cl_uint             cl_pipe_info;
+#endif
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+#ifdef CL_VERSION_1_2
+typedef cl_uint             cl_program_binary_type;
+#endif
+typedef cl_int              cl_build_status;
+typedef cl_uint             cl_kernel_info;
+#ifdef CL_VERSION_1_2
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_uint             cl_kernel_arg_address_qualifier;
+typedef cl_uint             cl_kernel_arg_access_qualifier;
+typedef cl_bitfield         cl_kernel_arg_type_qualifier;
+#endif
+typedef cl_uint             cl_kernel_work_group_info;
+#ifdef CL_VERSION_2_1
+typedef cl_uint             cl_kernel_sub_group_info;
+#endif
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+#ifdef CL_VERSION_2_0
+typedef cl_bitfield         cl_sampler_properties;
+typedef cl_uint             cl_kernel_exec_info;
+#endif
+typedef struct _cl_image_format {    /* Pairs a channel order with a channel data type. */
+    cl_channel_order        image_channel_order;        /* one of the cl_channel_order enumerants (CL_R, CL_RGBA, ...) */
+    cl_channel_type         image_channel_data_type;    /* one of the cl_channel_type enumerants (CL_SNORM_INT8, CL_FLOAT, ...) */
+} cl_image_format;
+#ifdef CL_VERSION_1_2  /* cl_image_desc is only declared when CL_VERSION_1_2 is defined */
+typedef struct _cl_image_desc {    /* Image description record: type, extents, pitches, mip/sample counts and an associated cl_mem. */
+    cl_mem_object_type      image_type;         /* one of the CL_MEM_OBJECT_* enumerants */
+    size_t                  image_width;        /* units for the extents are not stated in this header; presumably pixels, confirm in the OpenCL spec */
+    size_t                  image_height;
+    size_t                  image_depth;
+    size_t                  image_array_size;
+    size_t                  image_row_pitch;    /* units for the pitches are not stated in this header; presumably bytes, confirm in the OpenCL spec */
+    size_t                  image_slice_pitch;
+    cl_uint                 num_mip_levels;
+    cl_uint                 num_samples;
+#ifdef __GNUC__
+    __extension__   /* Prevents warnings about anonymous union in -pedantic builds */
+#endif
+    union {    /* anonymous union: buffer and mem_object are two names for the same cl_mem storage */
+      cl_mem                  buffer;
+      cl_mem                  mem_object;
+    };
+} cl_image_desc;
+#endif  /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_1_1  /* cl_buffer_region is only declared when CL_VERSION_1_1 is defined */
+typedef struct _cl_buffer_region {    /* Describes a sub-range as a start offset plus a length. */
+    size_t                  origin;    /* start of the region; presumably a byte offset, confirm in the OpenCL spec */
+    size_t                  size;      /* length of the region; presumably in bytes, confirm in the OpenCL spec */
+} cl_buffer_region;
+#endif  /* CL_VERSION_1_1 */
+/******************************************************************************/
+/* Error Codes */
+#define CL_SUCCESS                                  0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#ifdef CL_VERSION_1_1
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#endif
+#ifdef CL_VERSION_1_2
+#define CL_COMPILE_PROGRAM_FAILURE                  -15
+#define CL_LINKER_NOT_AVAILABLE                     -16
+#define CL_LINK_PROGRAM_FAILURE                     -17
+#define CL_DEVICE_PARTITION_FAILED                  -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
+#endif
+#define CL_INVALID_VALUE                            -30
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#ifdef CL_VERSION_1_1
+#define CL_INVALID_PROPERTY                         -64
+#endif
+#ifdef CL_VERSION_1_2
+#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
+#define CL_INVALID_COMPILER_OPTIONS                 -66
+#define CL_INVALID_LINKER_OPTIONS                   -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_INVALID_PIPE_SIZE                        -69
+#define CL_INVALID_DEVICE_QUEUE                     -70
+#endif
+#ifdef CL_VERSION_2_2
+#define CL_INVALID_SPEC_ID                          -71
+#define CL_MAX_SIZE_RESTRICTION_EXCEEDED            -72
+#endif
+/* cl_bool */
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+#ifdef CL_VERSION_1_2
+#define CL_BLOCKING                                 CL_TRUE
+#define CL_NON_BLOCKING                             CL_FALSE
+#endif
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE                         0x0900
+#define CL_PLATFORM_VERSION                         0x0901
+#define CL_PLATFORM_NAME                            0x0902
+#define CL_PLATFORM_VENDOR                          0x0903
+#define CL_PLATFORM_EXTENSIONS                      0x0904
+#ifdef CL_VERSION_2_1
+#define CL_PLATFORM_HOST_TIMER_RESOLUTION           0x0905
+#endif
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#ifdef CL_VERSION_1_2
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
+#endif  /* CL_VERSION_1_2 */
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF  /* low 32 bits set, so it includes every CL_DEVICE_TYPE_* bit defined above; cl_device_type itself is a cl_bitfield (cl_ulong) */
+/* cl_device_info */
+#define CL_DEVICE_TYPE                                   0x1000
+#define CL_DEVICE_VENDOR_ID                              0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                      0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS               0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE                    0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES                    0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR            0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT           0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT             0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG            0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT           0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE          0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY                    0x100C
+#define CL_DEVICE_ADDRESS_BITS                           0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS                    0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                   0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                     0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                      0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                     0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                      0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                     0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                      0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                          0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                     0x1017
+#define CL_DEVICE_MAX_SAMPLERS                           0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                    0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE               0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                       0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                  0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE              0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                  0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                        0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE               0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                      0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                         0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                         0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT               0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION             0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                          0x1026
+#define CL_DEVICE_AVAILABLE                              0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                     0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES                 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                       0x102A    /* deprecated */
+#ifdef CL_VERSION_2_0
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES               0x102A    /* same value as CL_DEVICE_QUEUE_PROPERTIES; this name exists only when CL_VERSION_2_0 is defined */
+#endif  /* CL_VERSION_2_0 */
+#define CL_DEVICE_NAME                                   0x102B
+#define CL_DEVICE_VENDOR                                 0x102C
+#define CL_DRIVER_VERSION                                0x102D
+#define CL_DEVICE_PROFILE                                0x102E
+#define CL_DEVICE_VERSION                                0x102F
+#define CL_DEVICE_EXTENSIONS                             0x1030
+#define CL_DEVICE_PLATFORM                               0x1031
+#ifdef CL_VERSION_1_2
+#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032
+#endif
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */
+#ifdef CL_VERSION_1_1
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF            0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY                    0x1035   /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR               0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT              0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT                0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG               0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT              0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE             0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF               0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                       0x103D
+#endif
+#ifdef CL_VERSION_1_2
+#define CL_DEVICE_LINKER_AVAILABLE                       0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                       0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                  0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                   0x1041
+#define CL_DEVICE_PARENT_DEVICE                          0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES              0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES                   0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN              0x1045
+#define CL_DEVICE_PARTITION_TYPE                         0x1046
+#define CL_DEVICE_REFERENCE_COUNT                        0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC            0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                     0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                  0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT           0x104B
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS              0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE               0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES             0x104E
+#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE         0x104F
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE               0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                   0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                   0x1052
+#define CL_DEVICE_SVM_CAPABILITIES                       0x1053
+#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE   0x1054
+#define CL_DEVICE_MAX_PIPE_ARGS                          0x1055
+#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS           0x1056
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                   0x1057
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT    0x1058
+#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT      0x1059
+#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT       0x105A
+#endif
+#ifdef CL_VERSION_2_1
+#define CL_DEVICE_IL_VERSION                             0x105B
+#define CL_DEVICE_MAX_NUM_SUB_GROUPS                     0x105C
+#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D
+#endif
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM                                (1 << 0)
+#define CL_FP_INF_NAN                               (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
+#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
+#define CL_FP_ROUND_TO_INF                          (1 << 4)
+#define CL_FP_FMA                                   (1 << 5)
+#ifdef CL_VERSION_1_1
+#define CL_FP_SOFT_FLOAT                            (1 << 6)
+#endif
+#ifdef CL_VERSION_1_2
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
+#endif
+/* cl_device_mem_cache_type */
+#define CL_NONE                                     0x0
+#define CL_READ_ONLY_CACHE                          0x1
+#define CL_READ_WRITE_CACHE                         0x2
+/* cl_device_local_mem_type */
+#define CL_LOCAL                                    0x1
+#define CL_GLOBAL                                   0x2
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL                              (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+#ifdef CL_VERSION_2_0
+#define CL_QUEUE_ON_DEVICE                          (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)
+#endif
+/* cl_context_info  */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#ifdef CL_VERSION_1_1
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+#endif
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+#ifdef CL_VERSION_1_2
+#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
+#endif
+#ifdef CL_VERSION_1_2
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0     /* value 0x0, outside the 0x1086-0x1088 run; the name suggests a list terminator, confirm in the OpenCL spec */
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
+#endif  /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_1_2
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+#endif
+#ifdef CL_VERSION_2_0
+/* cl_device_svm_capabilities */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)
+#endif
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT                            0x1090
+#define CL_QUEUE_DEVICE                             0x1091
+#define CL_QUEUE_REFERENCE_COUNT                    0x1092
+#define CL_QUEUE_PROPERTIES                         0x1093
+#ifdef CL_VERSION_2_0
+#define CL_QUEUE_SIZE                               0x1094
+#endif
+#ifdef CL_VERSION_2_1
+#define CL_QUEUE_DEVICE_DEFAULT                     0x1095
+#endif
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+/* reserved                                         (1 << 6)    */
+#ifdef CL_VERSION_1_2
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)
+#endif
+#ifdef CL_VERSION_1_2
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
+#endif
+/* cl_channel_order */
+#define CL_R                                        0x10B0
+#define CL_A                                        0x10B1
+#define CL_RG                                       0x10B2
+#define CL_RA                                       0x10B3
+#define CL_RGB                                      0x10B4
+#define CL_RGBA                                     0x10B5
+#define CL_BGRA                                     0x10B6
+#define CL_ARGB                                     0x10B7
+#define CL_INTENSITY                                0x10B8
+#define CL_LUMINANCE                                0x10B9
+#ifdef CL_VERSION_1_1
+#define CL_Rx                                       0x10BA
+#define CL_RGx                                      0x10BB
+#define CL_RGBx                                     0x10BC
+#endif
+#ifdef CL_VERSION_1_2
+#define CL_DEPTH                                    0x10BD
+#define CL_DEPTH_STENCIL                            0x10BE
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_sRGB                                     0x10BF
+#define CL_sRGBx                                    0x10C0
+#define CL_sRGBA                                    0x10C1
+#define CL_sBGRA                                    0x10C2
+#define CL_ABGR                                     0x10C3
+#endif
+/* cl_channel_type */
+#define CL_SNORM_INT8                               0x10D0
+#define CL_SNORM_INT16                              0x10D1
+#define CL_UNORM_INT8                               0x10D2
+#define CL_UNORM_INT16                              0x10D3
+#define CL_UNORM_SHORT_565                          0x10D4
+#define CL_UNORM_SHORT_555                          0x10D5
+#define CL_UNORM_INT_101010                         0x10D6
+#define CL_SIGNED_INT8                              0x10D7
+#define CL_SIGNED_INT16                             0x10D8
+#define CL_SIGNED_INT32                             0x10D9
+#define CL_UNSIGNED_INT8                            0x10DA
+#define CL_UNSIGNED_INT16                           0x10DB
+#define CL_UNSIGNED_INT32                           0x10DC
+#define CL_HALF_FLOAT                               0x10DD
+#define CL_FLOAT                                    0x10DE
+#ifdef CL_VERSION_1_2
+#define CL_UNORM_INT24                              0x10DF
+#endif
+#ifdef CL_VERSION_2_1
+#define CL_UNORM_INT_101010_2                       0x10E0
+#endif
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+#ifdef CL_VERSION_1_2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_MEM_OBJECT_PIPE                          0x10F7
+#endif
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#ifdef CL_VERSION_1_1
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_MEM_USES_SVM_POINTER                     0x1109
+#endif
+/* cl_image_info */
+#define CL_IMAGE_FORMAT                             0x1110
+#define CL_IMAGE_ELEMENT_SIZE                       0x1111
+#define CL_IMAGE_ROW_PITCH                          0x1112
+#define CL_IMAGE_SLICE_PITCH                        0x1113
+#define CL_IMAGE_WIDTH                              0x1114
+#define CL_IMAGE_HEIGHT                             0x1115
+#define CL_IMAGE_DEPTH                              0x1116
+#ifdef CL_VERSION_1_2
+#define CL_IMAGE_ARRAY_SIZE                         0x1117
+#define CL_IMAGE_BUFFER                             0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
+#define CL_IMAGE_NUM_SAMPLES                        0x111A
+#endif
+#ifdef CL_VERSION_2_0
+/* cl_pipe_info */
+#define CL_PIPE_PACKET_SIZE                         0x1120
+#define CL_PIPE_MAX_PACKETS                         0x1121
+#endif
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE                             0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
+#define CL_ADDRESS_CLAMP                            0x1132
+#define CL_ADDRESS_REPEAT                           0x1133
+#ifdef CL_VERSION_1_1
+#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
+#endif
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST                           0x1140
+#define CL_FILTER_LINEAR                            0x1141
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
+#define CL_SAMPLER_CONTEXT                          0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
+#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
+#define CL_SAMPLER_FILTER_MODE                      0x1154
+#ifdef CL_VERSION_2_0
+#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155
+#define CL_SAMPLER_LOD_MIN                          0x1156
+#define CL_SAMPLER_LOD_MAX                          0x1157
+#endif
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ                                 (1 << 0)
+#define CL_MAP_WRITE                                (1 << 1)
+#ifdef CL_VERSION_1_2
+#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
+#endif
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+#ifdef CL_VERSION_1_2
+#define CL_PROGRAM_NUM_KERNELS                      0x1167
+#define CL_PROGRAM_KERNEL_NAMES                     0x1168
+#endif
+#ifdef CL_VERSION_2_1
+#define CL_PROGRAM_IL                               0x1169
+#endif
+#ifdef CL_VERSION_2_2
+#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT       0x116A
+#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT       0x116B
+#endif
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+#ifdef CL_VERSION_1_2
+#define CL_PROGRAM_BINARY_TYPE                      0x1184
+#endif
+#ifdef CL_VERSION_2_0
+#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
+#endif
+#ifdef CL_VERSION_1_2
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
+#endif
+/* cl_build_status */
+#define CL_BUILD_SUCCESS                            0
+#define CL_BUILD_NONE                               -1
+#define CL_BUILD_ERROR                              -2
+#define CL_BUILD_IN_PROGRESS                        -3
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+#ifdef CL_VERSION_1_2
+#define CL_KERNEL_ATTRIBUTES                        0x1195
+#endif
+#ifdef CL_VERSION_2_1
+#define CL_KERNEL_MAX_NUM_SUB_GROUPS                0x11B9
+#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS            0x11BA
+#endif
+#ifdef CL_VERSION_1_2
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
+#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
+#define CL_KERNEL_ARG_NAME                          0x119A
+#endif
+#ifdef CL_VERSION_1_2
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
+#endif
+#ifdef CL_VERSION_1_2
+/* cl_kernel_arg_access_qualifier: values 0x11A0-0x11A3 (0x119F is not used by this group), CL_VERSION_1_2 only */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_1_2
+/* cl_kernel_arg_type_qualifier: single-bit masks built with shifts; NONE is 0 */
+#define CL_KERNEL_ARG_TYPE_NONE                     0
+#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+#ifdef CL_VERSION_2_0
+#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3) /* needs both CL_VERSION_1_2 and CL_VERSION_2_0 */
+#endif /* CL_VERSION_2_0 */
+#endif /* CL_VERSION_1_2 */
+/* cl_kernel_work_group_info: 0x11B0-0x11B4 are unconditional; 0x11B5 needs CL_VERSION_1_2 */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+#ifdef CL_VERSION_1_2
+#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_2_1
+/* cl_kernel_sub_group_info: CL_VERSION_2_1 only; note the values are not contiguous (0x2033, 0x2034, then 0x11B8) */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE    0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE       0x2034
+#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT    0x11B8
+#endif /* CL_VERSION_2_1 */
+#ifdef CL_VERSION_2_0
+/* cl_kernel_exec_info: values 0x11B6-0x11B7, defined only when CL_VERSION_2_0 is defined */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM   0x11B7
+#endif /* CL_VERSION_2_0 */
+/* cl_event_info: 0x11D0-0x11D3 are unconditional; CL_EVENT_CONTEXT needs CL_VERSION_1_1 */
+#define CL_EVENT_COMMAND_QUEUE                      0x11D0
+#define CL_EVENT_COMMAND_TYPE                       0x11D1
+#define CL_EVENT_REFERENCE_COUNT                    0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
+#ifdef CL_VERSION_1_1
+#define CL_EVENT_CONTEXT                            0x11D4
+#endif /* CL_VERSION_1_1 */
+/* cl_command_type: 0x11F0-0x1200 are unconditional; later values are added per CL_VERSION_* guard below */
+#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
+#define CL_COMMAND_TASK                             0x11F1
+#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
+#define CL_COMMAND_READ_BUFFER                      0x11F3
+#define CL_COMMAND_WRITE_BUFFER                     0x11F4
+#define CL_COMMAND_COPY_BUFFER                      0x11F5
+#define CL_COMMAND_READ_IMAGE                       0x11F6
+#define CL_COMMAND_WRITE_IMAGE                      0x11F7
+#define CL_COMMAND_COPY_IMAGE                       0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
+#define CL_COMMAND_MAP_BUFFER                       0x11FB
+#define CL_COMMAND_MAP_IMAGE                        0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
+#define CL_COMMAND_MARKER                           0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
+#ifdef CL_VERSION_1_1
+#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
+#define CL_COMMAND_USER                             0x1204
+#endif /* CL_VERSION_1_1 */
+#ifdef CL_VERSION_1_2
+#define CL_COMMAND_BARRIER                          0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
+#define CL_COMMAND_FILL_BUFFER                      0x1207
+#define CL_COMMAND_FILL_IMAGE                       0x1208
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_2_0
+#define CL_COMMAND_SVM_FREE                         0x1209
+#define CL_COMMAND_SVM_MEMCPY                       0x120A
+#define CL_COMMAND_SVM_MEMFILL                      0x120B
+#define CL_COMMAND_SVM_MAP                          0x120C
+#define CL_COMMAND_SVM_UNMAP                        0x120D
+#endif /* CL_VERSION_2_0 */
+/* command execution status: CL_COMPLETE is 0 and the values rise to CL_QUEUED at 0x3 */
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+#ifdef CL_VERSION_1_1
+/* cl_buffer_create_type: single value, defined only when CL_VERSION_1_1 is defined */
+#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
+#endif /* CL_VERSION_1_1 */
+/* cl_profiling_info: 0x1280-0x1283 are unconditional; CL_PROFILING_COMMAND_COMPLETE needs CL_VERSION_2_0 */
+#define CL_PROFILING_COMMAND_QUEUED                 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
+#define CL_PROFILING_COMMAND_START                  0x1282
+#define CL_PROFILING_COMMAND_END                    0x1283
+#ifdef CL_VERSION_2_0
+#define CL_PROFILING_COMMAND_COMPLETE               0x1284
+#endif /* CL_VERSION_2_0 */
+/********************************************************************************************************/
+/* Platform API: clGetPlatformIDs and clGetPlatformInfo; both return cl_int and carry the 1.0 version suffix */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint          /* num_entries */,
+                 cl_platform_id * /* platforms */,
+                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id   /* platform */,
+                  cl_platform_info /* param_name */,
+                  size_t           /* param_value_size */,
+                  void *           /* param_value */,
+                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Device APIs: the two query functions are unconditional; the rest are gated on CL_VERSION_1_2 / CL_VERSION_2_1 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id   /* platform */,
+               cl_device_type   /* device_type */,
+               cl_uint          /* num_entries */,
+               cl_device_id *   /* devices */,
+               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id    /* device */,
+                cl_device_info  /* param_name */,
+                size_t          /* param_value_size */,
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id                         /* in_device */,
+                   const cl_device_partition_property * /* properties */,
+                   cl_uint                              /* num_devices */,
+                   cl_device_id *                       /* out_devices */,
+                   cl_uint *                            /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_2_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetDefaultDeviceCommandQueue(cl_context           /* context */,
+                               cl_device_id         /* device */,
+                               cl_command_queue     /* command_queue */) CL_API_SUFFIX__VERSION_2_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceAndHostTimer(cl_device_id    /* device */,
+                        cl_ulong*       /* device_timestamp */,
+                        cl_ulong*       /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetHostTimer(cl_device_id /* device */,
+               cl_ulong *   /* host_timestamp */)  CL_API_SUFFIX__VERSION_2_1;
+#endif /* CL_VERSION_2_1 */
+/* Context APIs: both create functions return cl_context and take a CL_CALLBACK pfn_notify, user_data and a cl_int * errcode_ret */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+                cl_uint                 /* num_devices */,
+                const cl_device_id *    /* devices */,
+                void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+                void *                  /* user_data */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+                        cl_device_type          /* device_type */,
+                        void (CL_CALLBACK *     /* pfn_notify */ )(const char *, const void *, size_t, void *),
+                        void *                  /* user_data */,
+                        cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context         /* context */,
+                 cl_context_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Command Queue APIs: the only create function declared here needs CL_VERSION_2_0; retain/release/info are unconditional */
+#ifdef CL_VERSION_2_0
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithProperties(cl_context               /* context */,
+                                   cl_device_id             /* device */,
+                                   const cl_queue_properties *    /* properties */,
+                                   cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
+                      cl_command_queue_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Memory Object APIs: creation functions return cl_mem; version-specific entry points sit inside CL_VERSION_* guards */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context   /* context */,
+               cl_mem_flags /* flags */,
+               size_t       /* size */,
+               void *       /* host_ptr */,
+               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem                   /* buffer */,
+                  cl_mem_flags             /* flags */,
+                  cl_buffer_create_type    /* buffer_create_type */,
+                  const void *             /* buffer_create_info */,
+                  cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+#endif /* CL_VERSION_1_1 */
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context              /* context */,
+              cl_mem_flags            /* flags */,
+              const cl_image_format * /* image_format */,
+              const cl_image_desc *   /* image_desc */,
+              void *                  /* host_ptr */,
+              cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_2_0
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe(cl_context                 /* context */,
+             cl_mem_flags               /* flags */,
+             cl_uint                    /* pipe_packet_size */,
+             cl_uint                    /* pipe_max_packets */,
+             const cl_pipe_properties * /* properties */,
+             cl_int *                   /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context           /* context */,
+                           cl_mem_flags         /* flags */,
+                           cl_mem_object_type   /* image_type */,
+                           cl_uint              /* num_entries */,
+                           cl_image_format *    /* image_formats */,
+                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem           /* memobj */,
+                   cl_mem_info      /* param_name */,
+                   size_t           /* param_value_size */,
+                   void *           /* param_value */,
+                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem           /* image */,
+               cl_image_info    /* param_name */,
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_2_0
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPipeInfo(cl_mem           /* pipe */,
+              cl_pipe_info     /* param_name */,
+              size_t           /* param_value_size */,
+              void *           /* param_value */,
+              size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(cl_mem /* memobj */,
+                                 void (CL_CALLBACK * /* pfn_notify */)( cl_mem /* memobj */, void* /* user_data */),
+                                 void * /* user_data */ )             CL_API_SUFFIX__VERSION_1_1;
+#endif /* CL_VERSION_1_1 */
+/* SVM Allocation APIs: declared only when CL_VERSION_2_0 is defined; clSVMAlloc returns void *, clSVMFree returns void */
+#ifdef CL_VERSION_2_0
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc(cl_context       /* context */,
+           cl_svm_mem_flags /* flags */,
+           size_t           /* size */,
+           cl_uint          /* alignment */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFree(cl_context        /* context */,
+          void *            /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+/* Sampler APIs: the only create function declared here needs CL_VERSION_2_0; retain/release/info are unconditional */
+#ifdef CL_VERSION_2_0
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSamplerWithProperties(cl_context                     /* context */,
+                              const cl_sampler_properties *  /* sampler_properties */,
+                              cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler         /* sampler */,
+                 cl_sampler_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Program Object APIs: creation functions return cl_program; entry points newer than 1.0 sit inside CL_VERSION_* guards */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context        /* context */,
+                          cl_uint           /* count */,
+                          const char **     /* strings */,
+                          const size_t *    /* lengths */,
+                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context                     /* context */,
+                          cl_uint                        /* num_devices */,
+                          const cl_device_id *           /* device_list */,
+                          const size_t *                 /* lengths */,
+                          const unsigned char **         /* binaries */,
+                          cl_int *                       /* binary_status */,
+                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context            /* context */,
+                                  cl_uint               /* num_devices */,
+                                  const cl_device_id *  /* device_list */,
+                                  const char *          /* kernel_names */,
+                                  cl_int *              /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_2_1
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithIL(cl_context    /* context */,
+                     const void*    /* il */,
+                     size_t         /* length */,
+                     cl_int*        /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1;
+#endif /* CL_VERSION_2_1 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program           /* program */,
+               cl_uint              /* num_devices */,
+               const cl_device_id * /* device_list */,
+               const char *         /* options */,
+               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program           /* program */,
+                 cl_uint              /* num_devices */,
+                 const cl_device_id * /* device_list */,
+                 const char *         /* options */,
+                 cl_uint              /* num_input_headers */,
+                 const cl_program *   /* input_headers */,
+                 const char **        /* header_include_names */,
+                 void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+                 void *               /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context           /* context */,
+              cl_uint              /* num_devices */,
+              const cl_device_id * /* device_list */,
+              const char *         /* options */,
+              cl_uint              /* num_input_programs */,
+              const cl_program *   /* input_programs */,
+              void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+              void *               /* user_data */,
+              cl_int *             /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+#endif /* CL_VERSION_1_2 */
+#ifdef CL_VERSION_2_2
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetProgramReleaseCallback(cl_program          /* program */,
+                            void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+                            void *              /* user_data */) CL_API_SUFFIX__VERSION_2_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetProgramSpecializationConstant(cl_program  /* program */,
+                                   cl_uint     /* spec_id */,
+                                   size_t      /* spec_size */,
+                                   const void* /* spec_value */) CL_API_SUFFIX__VERSION_2_2;
+#endif /* CL_VERSION_2_2 */
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+#endif /* CL_VERSION_1_2 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program         /* program */,
+                 cl_program_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program            /* program */,
+                      cl_device_id          /* device */,
+                      cl_program_build_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Kernel Object APIs: clCreateKernel and clCloneKernel return cl_kernel; every other entry point here returns cl_int */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program      /* program */,
+               const char *    /* kernel_name */,
+               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program     /* program */,
+                         cl_uint        /* num_kernels */,
+                         cl_kernel *    /* kernels */,
+                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_2_1
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCloneKernel(cl_kernel     /* source_kernel */,
+              cl_int*       /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1;
+#endif /* CL_VERSION_2_1 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel    /* kernel */,
+               cl_uint      /* arg_index */,
+               size_t       /* arg_size */,
+               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_2_0
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer(cl_kernel    /* kernel */,
+                         cl_uint      /* arg_index */,
+                         const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo(cl_kernel            /* kernel */,
+                    cl_kernel_exec_info  /* param_name */,
+                    size_t               /* param_value_size */,
+                    const void *         /* param_value */) CL_API_SUFFIX__VERSION_2_0;
+#endif /* CL_VERSION_2_0 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel       /* kernel */,
+                cl_kernel_info  /* param_name */,
+                size_t          /* param_value_size */,
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel       /* kernel */,
+                   cl_uint         /* arg_indx */,
+                   cl_kernel_arg_info  /* param_name */,
+                   size_t          /* param_value_size */,
+                   void *          /* param_value */,
+                   size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+#endif /* CL_VERSION_1_2 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
+                         cl_device_id               /* device */,
+                         cl_kernel_work_group_info  /* param_name */,
+                         size_t                     /* param_value_size */,
+                         void *                     /* param_value */,
+                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_2_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfo(cl_kernel                   /* kernel */,
+                        cl_device_id                /* device */,
+                        cl_kernel_sub_group_info    /* param_name */,
+                        size_t                      /* input_value_size */,
+                        const void*                 /* input_value */,
+                        size_t                      /* param_value_size */,
+                        void*                       /* param_value */,
+                        size_t*                     /* param_value_size_ret */ ) CL_API_SUFFIX__VERSION_2_1;
+#endif /* CL_VERSION_2_1 */
+/* Event Object APIs: user-event creation, user-event status and event callbacks need CL_VERSION_1_1; the rest are unconditional */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint             /* num_events */,
+                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event         /* event */,
+               cl_event_info    /* param_name */,
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context    /* context */,
+                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+#endif /* CL_VERSION_1_1 */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event   /* event */,
+                     cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event    /* event */,
+                    cl_int      /* command_exec_callback_type */,
+                    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+                    void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+#endif /* CL_VERSION_1_1 */
+/* Profiling APIs: a single query entry point keyed by cl_profiling_info; returns cl_int */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event            /* event */,
+                        cl_profiling_info   /* param_name */,
+                        size_t              /* param_value_size */,
+                        void *              /* param_value */,
+                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Flush and Finish APIs: both take only a cl_command_queue and return cl_int */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
+                    cl_mem              /* buffer */,
+                    cl_bool             /* blocking_read */,
+                    size_t              /* offset */,
+                    size_t              /* size */,
+                    void *              /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
+                        cl_mem              /* buffer */,
+                        cl_bool             /* blocking_read */,
+                        const size_t *      /* buffer_offset */,
+                        const size_t *      /* host_offset */,
+                        const size_t *      /* region */,
+                        size_t              /* buffer_row_pitch */,
+                        size_t              /* buffer_slice_pitch */,
+                        size_t              /* host_row_pitch */,
+                        size_t              /* host_slice_pitch */,
+                        void *              /* ptr */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+#endif
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue   /* command_queue */,
+                     cl_mem             /* buffer */,
+                     cl_bool            /* blocking_write */,
+                     size_t             /* offset */,
+                     size_t             /* size */,
+                     const void *       /* ptr */,
+                     cl_uint            /* num_events_in_wait_list */,
+                     const cl_event *   /* event_wait_list */,
+                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
+                         cl_mem              /* buffer */,
+                         cl_bool             /* blocking_write */,
+                         const size_t *      /* buffer_offset */,
+                         const size_t *      /* host_offset */,
+                         const size_t *      /* region */,
+                         size_t              /* buffer_row_pitch */,
+                         size_t              /* buffer_slice_pitch */,
+                         size_t              /* host_row_pitch */,
+                         size_t              /* host_slice_pitch */,
+                         const void *        /* ptr */,
+                         cl_uint             /* num_events_in_wait_list */,
+                         const cl_event *    /* event_wait_list */,
+                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+#endif
+#ifdef CL_VERSION_1_2
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue   /* command_queue */,
+                    cl_mem             /* buffer */,
+                    const void *       /* pattern */,
+                    size_t             /* pattern_size */,
+                    size_t             /* offset */,
+                    size_t             /* size */,
+                    cl_uint            /* num_events_in_wait_list */,
+                    const cl_event *   /* event_wait_list */,
+                    cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+#endif
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue    /* command_queue */,
+                    cl_mem              /* src_buffer */,
+                    cl_mem              /* dst_buffer */,
+                    size_t              /* src_offset */,
+                    size_t              /* dst_offset */,
+                    size_t              /* size */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */,
+                        cl_mem              /* src_buffer */,
+                        cl_mem              /* dst_buffer */,
+                        const size_t *      /* src_origin */,
+                        const size_t *      /* dst_origin */,
+                        const size_t *      /* region */,
+                        size_t              /* src_row_pitch */,
+                        size_t              /* src_slice_pitch */,
+                        size_t              /* dst_row_pitch */,
+                        size_t              /* dst_slice_pitch */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+#endif
+/*
+ * clEnqueueReadImage: enqueue a transfer from `image` into the host memory at
+ * `ptr` (a writable void *).  origin and region are 3-element size_t arrays,
+ * as the origin[3] / region[3] parameter notes indicate.  blocking_read is a
+ * cl_bool flag (presumably: whether the call waits for the transfer to
+ * finish -- confirm against the OpenCL specification).
+ * Returns a cl_int status code.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* image */,
+                   cl_bool              /* blocking_read */,
+                   const size_t *       /* origin[3] */,
+                   const size_t *       /* region[3] */,
+                   size_t               /* row_pitch */,
+                   size_t               /* slice_pitch */,
+                   void *               /* ptr */,
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueWriteImage: the mirror of clEnqueueReadImage.  Here `ptr` is a
+ * const void * (the host data is only read) and the pitches are named
+ * input_row_pitch / input_slice_pitch.  Returns a cl_int status code.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue    /* command_queue */,
+                    cl_mem              /* image */,
+                    cl_bool             /* blocking_write */,
+                    const size_t *      /* origin[3] */,
+                    const size_t *      /* region[3] */,
+                    size_t              /* input_row_pitch */,
+                    size_t              /* input_slice_pitch */,
+                    const void *        /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+/*
+ * clEnqueueFillImage: enqueue a fill of the given region of `image` with
+ * fill_color.  fill_color is passed as an untyped const void *; its expected
+ * layout is not visible here (see the OpenCL specification).  origin and
+ * region are 3-element size_t arrays.  Returns a cl_int status code.
+ * Only declared when CL_VERSION_1_2 is defined.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue   /* command_queue */,
+                   cl_mem             /* image */,
+                   const void *       /* fill_color */,
+                   const size_t *     /* origin[3] */,
+                   const size_t *     /* region[3] */,
+                   cl_uint            /* num_events_in_wait_list */,
+                   const cl_event *   /* event_wait_list */,
+                   cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+#endif
+/*
+ * clEnqueueCopyImage: enqueue an image-to-image copy.  src_origin, dst_origin
+ * and region are each 3-element size_t arrays; a single region is shared by
+ * the source and the destination.  Returns a cl_int status code.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* src_image */,
+                   cl_mem               /* dst_image */,
+                   const size_t *       /* src_origin[3] */,
+                   const size_t *       /* dst_origin[3] */,
+                   const size_t *       /* region[3] */,
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueCopyImageToBuffer: enqueue a copy from a region of src_image
+ * (3-element src_origin / region arrays) into dst_buffer starting at the
+ * scalar dst_offset.  Returns a cl_int status code.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_image */,
+                           cl_mem           /* dst_buffer */,
+                           const size_t *   /* src_origin[3] */,
+                           const size_t *   /* region[3] */,
+                           size_t           /* dst_offset */,
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueCopyBufferToImage: the opposite direction.  Data is taken from
+ * src_buffer starting at the scalar src_offset and placed into the region of
+ * dst_image described by the 3-element dst_origin / region arrays.  Note that
+ * the offset parameter comes before the origin/region pair here, whereas it
+ * comes after them in clEnqueueCopyImageToBuffer.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_buffer */,
+                           cl_mem           /* dst_image */,
+                           size_t           /* src_offset */,
+                           const size_t *   /* dst_origin[3] */,
+                           const size_t *   /* region[3] */,
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueMapBuffer: enqueue a mapping of the [offset, offset + size) range
+ * of `buffer`.  Unlike most clEnqueue* declarations in this header it returns
+ * a void * (presumably the host pointer to the mapped range) and reports its
+ * status through the errcode_ret out-parameter instead of the return value.
+ * map_flags is a cl_map_flags value; blocking_map is a cl_bool flag.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+                   cl_mem           /* buffer */,
+                   cl_bool          /* blocking_map */,
+                   cl_map_flags     /* map_flags */,
+                   size_t           /* offset */,
+                   size_t           /* size */,
+                   cl_uint          /* num_events_in_wait_list */,
+                   const cl_event * /* event_wait_list */,
+                   cl_event *       /* event */,
+                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueMapImage: image counterpart of clEnqueueMapBuffer.  The mapped area
+ * is given by 3-element origin / region arrays, and image_row_pitch /
+ * image_slice_pitch are size_t * out-parameters.  Returns a void * and
+ * reports status through errcode_ret.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue  /* command_queue */,
+                  cl_mem            /* image */,
+                  cl_bool           /* blocking_map */,
+                  cl_map_flags      /* map_flags */,
+                  const size_t *    /* origin[3] */,
+                  const size_t *    /* region[3] */,
+                  size_t *          /* image_row_pitch */,
+                  size_t *          /* image_slice_pitch */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */,
+                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueUnmapMemObject: enqueue the release of a mapping.  It takes the
+ * memory object and the mapped_ptr (presumably the pointer previously returned
+ * by one of the two map calls above).  Returns a cl_int status code.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+                        cl_mem           /* memobj */,
+                        void *           /* mapped_ptr */,
+                        cl_uint          /* num_events_in_wait_list */,
+                        const cl_event *  /* event_wait_list */,
+                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+/*
+ * clEnqueueMigrateMemObjects: enqueue a migration request for an array of
+ * num_mem_objects memory objects (mem_objects), controlled by a
+ * cl_mem_migration_flags value.  Returns a cl_int status code.
+ * Only declared when CL_VERSION_1_2 is defined.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue       /* command_queue */,
+                           cl_uint                /* num_mem_objects */,
+                           const cl_mem *         /* mem_objects */,
+                           cl_mem_migration_flags /* flags */,
+                           cl_uint                /* num_events_in_wait_list */,
+                           const cl_event *       /* event_wait_list */,
+                           cl_event *             /* event */) CL_API_SUFFIX__VERSION_1_2;
+#endif
+/*
+ * clEnqueueNDRangeKernel: enqueue `kernel` for execution on command_queue.
+ * work_dim is the number of dimensions; global_work_offset, global_work_size
+ * and local_work_size are size_t arrays (presumably work_dim entries each --
+ * confirm against the OpenCL specification).  Returns a cl_int status code.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+/*
+ * clEnqueueNativeKernel: enqueue a host callback instead of a cl_kernel.
+ * user_func is a CL_CALLBACK function pointer taking a single void *; args
+ * and cb_args describe the argument block (pointer and size), while mem_list
+ * and args_mem_loc are parallel inputs sized by num_mem_objects.  The exact
+ * argument-patching rules are defined by the OpenCL specification, not here.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
+                      void (CL_CALLBACK * /*user_func*/)(void *),
+                      void *            /* args */,
+                      size_t            /* cb_args */,
+                      cl_uint           /* num_mem_objects */,
+                      const cl_mem *    /* mem_list */,
+                      const void **     /* args_mem_loc */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+#ifdef CL_VERSION_1_2
+/*
+ * clEnqueueMarkerWithWaitList / clEnqueueBarrierWithWaitList: marker and
+ * barrier commands that take an explicit event wait list and can hand back an
+ * event.  Both share the same four-parameter signature and return a cl_int
+ * status code.  Only declared when CL_VERSION_1_2 is defined.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue  /* command_queue */,
+                            cl_uint           /* num_events_in_wait_list */,
+                            const cl_event *  /* event_wait_list */,
+                            cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue  /* command_queue */,
+                             cl_uint           /* num_events_in_wait_list */,
+                             const cl_event *  /* event_wait_list */,
+                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+#endif
+#ifdef CL_VERSION_2_0
+/*
+ * SVM enqueue commands.  The whole group is only declared when
+ * CL_VERSION_2_0 is defined and every entry is tagged
+ * CL_API_SUFFIX__VERSION_2_0.  These calls operate on raw pointers (void *)
+ * rather than cl_mem objects, and all return a cl_int status code.
+ */
+/*
+ * clEnqueueSVMFree: enqueue the release of num_svm_pointers pointers.
+ * pfn_free_func is a CL_CALLBACK receiving (queue, num_svm_pointers,
+ * svm_pointers[], user_data); user_data is passed as a separate parameter.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree(cl_command_queue  /* command_queue */,
+                 cl_uint           /* num_svm_pointers */,
+                 void *[]          /* svm_pointers[] */,
+                 void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                        cl_uint          /* num_svm_pointers */,
+                                                        void *[]         /* svm_pointers[] */,
+                                                        void *           /* user_data */),
+                 void *            /* user_data */,
+                 cl_uint           /* num_events_in_wait_list */,
+                 const cl_event *  /* event_wait_list */,
+                 cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+/* clEnqueueSVMMemcpy: enqueue a copy of `size` from src_ptr to dst_ptr;
+ * blocking_copy is a cl_bool flag. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_copy */,
+                   void *            /* dst_ptr */,
+                   const void *      /* src_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+/* clEnqueueSVMMemFill: enqueue a fill of `size` at svm_ptr using a pattern
+ * of pattern_size. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill(cl_command_queue  /* command_queue */,
+                    void *            /* svm_ptr */,
+                    const void *      /* pattern */,
+                    size_t            /* pattern_size */,
+                    size_t            /* size */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+/* clEnqueueSVMMap: enqueue a mapping of `size` at svm_ptr with the given
+ * cl_map_flags; blocking_map is a cl_bool flag. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap(cl_command_queue  /* command_queue */,
+                cl_bool           /* blocking_map */,
+                cl_map_flags      /* flags */,
+                void *            /* svm_ptr */,
+                size_t            /* size */,
+                cl_uint           /* num_events_in_wait_list */,
+                const cl_event *  /* event_wait_list */,
+                cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+/* clEnqueueSVMUnmap: enqueue the unmapping of svm_ptr. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap(cl_command_queue  /* command_queue */,
+                  void *            /* svm_ptr */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */) CL_API_SUFFIX__VERSION_2_0;
+#endif
+#ifdef CL_VERSION_2_1
+/*
+ * clEnqueueSVMMigrateMem: enqueue a migration request for num_svm_pointers
+ * pointers.  svm_pointers and sizes are parallel arrays and `flags` is a
+ * cl_mem_migration_flags value.  Returns a cl_int status code.
+ * Only declared when CL_VERSION_2_1 is defined.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMigrateMem(cl_command_queue         /* command_queue */,
+                       cl_uint                  /* num_svm_pointers */,
+                       const void **            /* svm_pointers */,
+                       const size_t *           /* sizes */,
+                       cl_mem_migration_flags   /* flags */,
+                       cl_uint                  /* num_events_in_wait_list */,
+                       const cl_event *         /* event_wait_list */,
+                       cl_event *               /* event */) CL_API_SUFFIX__VERSION_2_1;
+#endif
+#ifdef CL_VERSION_1_2
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name on the
+ * given platform, or NULL if a valid function cannot be found.  The client
+ * must check that the address is not NULL before using or calling the
+ * returned function address.
+ *
+ * Only declared when CL_VERSION_1_2 is defined.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+                                         const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
+#endif
+#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+    /*
+     *  clSetCommandQueueProperty: legacy OpenCL 1.0 entry point.  It is only
+     *  declared when the including code defines CL_USE_DEPRECATED_OPENCL_1_0_APIS.
+     *  It enables or disables `properties` on command_queue and can report the
+     *  previous value through old_properties.
+     *
+     *  WARNING:
+     *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED
+     *  to better facilitate thread safety.  The 1.0 API is not thread safe. It is not tested by the
+     *  OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
+     *  It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
+     *
+     *  Software developers previously relying on this API are instructed to set the command queue
+     *  properties when creating the queue, instead.
+     */
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clSetCommandQueueProperty(cl_command_queue              /* command_queue */,
+                              cl_command_queue_properties   /* properties */,
+                              cl_bool                        /* enable */,
+                              cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
+#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
+/*
+ * Deprecated OpenCL 1.1 APIs
+ *
+ * Every declaration in this group is wrapped in the
+ * CL_EXT_PREFIX__VERSION_1_1_DEPRECATED / CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ * macro pair (defined outside this section; presumably they expand to a
+ * compiler deprecation attribute).  clEnqueueMarkerWithWaitList,
+ * clEnqueueBarrierWithWaitList and clGetExtensionFunctionAddressForPlatform,
+ * declared earlier in this header under CL_VERSION_1_2, look like the
+ * non-deprecated counterparts of clEnqueueMarker, clEnqueueBarrier and
+ * clGetExtensionFunctionAddress -- confirm against the OpenCL specification.
+ */
+/* clCreateImage2D: create a 2D image (width, height, row pitch); returns the
+ * cl_mem and reports status through errcode_ret. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_row_pitch */,
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* clCreateImage3D: as clCreateImage2D, with an additional depth and slice
+ * pitch. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_depth */,
+                size_t                  /* image_row_pitch */,
+                size_t                  /* image_slice_pitch */,
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* clEnqueueMarker: marker command without a wait list; only hands back an
+ * event. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue    /* command_queue */,
+                cl_event *          /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* clEnqueueWaitForEvents: takes an event list only; no output event. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+                        cl_uint          /* num_events */,
+                        const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* clEnqueueBarrier: barrier command taking only the queue. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* clUnloadCompiler: takes no arguments (no platform parameter). */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* clGetExtensionFunctionAddress: looks an extension function up by name only,
+ * without a platform argument. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/*
+ * APIs deprecated by OpenCL 2.0
+ *
+ * These entry points carry the CL_EXT_PREFIX__VERSION_1_2_DEPRECATED /
+ * CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED macro pair, i.e. they are OpenCL 1.2
+ * APIs marked deprecated from 2.0 onwards.  The macros are defined outside
+ * this section.
+ */
+/* clCreateCommandQueue: create a queue for `device` in `context` with a
+ * cl_command_queue_properties value; status is reported through
+ * errcode_ret. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     /* context */,
+                     cl_device_id                   /* device */,
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+/* clCreateSampler: create a sampler from three fixed settings (normalized
+ * coordinates, addressing mode, filter mode); status is reported through
+ * errcode_ret. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */,
+                cl_addressing_mode  /* addressing_mode */,
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+/* clEnqueueTask: enqueue `kernel` without any work-size arguments. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+/* <d3d10.h> supplies ID3D10Buffer, ID3D10Texture2D, ID3D10Texture3D and UINT
+ * used by the typedefs below.  It is included unconditionally, so this header
+ * presumably only compiles where the Direct3D 10 SDK is available. */
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/******************************************************************************
+ * cl_khr_d3d10_sharing                                                       */
+#define cl_khr_d3d10_sharing 1
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+/******************************************************************************/
+/* Error Codes */
+#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
+/* cl_d3d10_device_source_khr */
+#define CL_D3D10_DEVICE_KHR                          0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
+/* cl_d3d10_device_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
+/* cl_context_info */
+#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+/* cl_mem_info */
+#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
+/* cl_image_info */
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
+/******************************************************************************/
+/*
+ * Function-pointer types for the extension entry points.  Only the pointer
+ * types (*_fn) are declared here, not extern functions; callers presumably
+ * look the addresses up at run time through the extension-function-address
+ * API -- confirm against the cl_khr_d3d10_sharing specification.
+ */
+/* Enumerate the OpenCL devices associated with a Direct3D 10 object.
+ * d3d_device_source / d3d_device_set take the constants defined above. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d10_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d10_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
+/* Create a cl_mem from an ID3D10Buffer. */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D10Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+/* Create a cl_mem from one subresource of an ID3D10Texture2D. */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+/* Create a cl_mem from one subresource of an ID3D10Texture3D. */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+/* Enqueue acquisition of num_objects shared memory objects. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+/* Enqueue release of num_objects shared memory objects; same signature as the
+ * acquire call. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_D3D10_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+/* <d3d11.h> supplies ID3D11Buffer, ID3D11Texture2D, ID3D11Texture3D and UINT
+ * used by the typedefs below.  It is included unconditionally, so this header
+ * presumably only compiles where the Direct3D 11 SDK is available. */
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/******************************************************************************
+ * cl_khr_d3d11_sharing                                                       */
+#define cl_khr_d3d11_sharing 1
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+/******************************************************************************/
+/* Error Codes */
+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
+/* cl_d3d11_device_source_khr */
+#define CL_D3D11_DEVICE_KHR                          0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
+/* cl_d3d11_device_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
+/* cl_context_info */
+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+/* cl_mem_info */
+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
+/* cl_image_info */
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
+/******************************************************************************/
+/*
+ * Function-pointer types for the extension entry points.  Only the pointer
+ * types (*_fn) are declared here, not extern functions; callers presumably
+ * look the addresses up at run time through the extension-function-address
+ * API -- confirm against the cl_khr_d3d11_sharing specification.  All of
+ * them are tagged CL_API_SUFFIX__VERSION_1_2.
+ */
+/* Enumerate the OpenCL devices associated with a Direct3D 11 object.
+ * d3d_device_source / d3d_device_set take the constants defined above. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d11_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d11_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
+/* Create a cl_mem from an ID3D11Buffer. */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+/* Create a cl_mem from one subresource of an ID3D11Texture2D. */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+/* Create a cl_mem from one subresource of an ID3D11Texture3D. */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+/* Enqueue acquisition of num_objects shared memory objects. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+/* Enqueue release of num_objects shared memory objects; same signature as the
+ * acquire call. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_D3D11_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/******************************************************************************/
+/* cl_khr_dx9_media_sharing                                                   */
+#define cl_khr_dx9_media_sharing 1
+typedef cl_uint             cl_dx9_media_adapter_type_khr;
+typedef cl_uint             cl_dx9_media_adapter_set_khr;
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+    IDirect3DSurface9 *resource;
+    HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+/******************************************************************************/
+/* Error Codes */
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
+/* cl_media_adapter_type_khr */
+#define CL_ADAPTER_D3D9_KHR                              0x2020
+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
+#define CL_ADAPTER_DXVA_KHR                              0x2022
+/* cl_media_adapter_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
+/* cl_context_info */
+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
+/* cl_mem_info */
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
+/* cl_image_info */
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
+/******************************************************************************/
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+    cl_platform_id                   platform,
+    cl_uint                          num_media_adapters,
+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
+    void *                           media_adapters,
+    cl_dx9_media_adapter_set_khr     media_adapter_set,
+    cl_uint                          num_entries,
+    cl_device_id *                   devices,
+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
+/*
+ * Pointer-to-function type named after clCreateFromDX9MediaSurfaceKHR.
+ * The function returns a cl_mem; errcode_ret is a pointer through which a
+ * cl_int can be passed back.
+ * surface_info is an untyped pointer; its layout presumably depends on
+ * adapter_type, and plane presumably selects one plane of the surface --
+ * TODO confirm against the extension specification.
+ */
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void *                        surface_info,
+    cl_uint                       plane,                                                                          
+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+/*
+ * Pointer-to-function type named after clEnqueueAcquireDX9MediaSurfacesKHR.
+ * The function returns a cl_int status. mem_objects is a read-only array
+ * (const cl_mem *), presumably of num_objects entries; event_wait_list is a
+ * read-only array presumably of num_events_in_wait_list entries; event is an
+ * output pointer -- sizes inferred from the names, TODO confirm.
+ */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+/*
+ * Pointer-to-function type named after clEnqueueReleaseDX9MediaSurfacesKHR.
+ * Its parameter list is identical to the Acquire typedef above: a command
+ * queue, a read-only cl_mem array with its count, a read-only event wait
+ * list with its count, and an output event pointer. Returns a cl_int status.
+ */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h
+/**********************************************************************************
+ * Copyright (c) 2008-2016 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/*****************************************************************************\
+Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+File Name: cl_dx9_media_sharing_intel.h
+Abstract:
+Notes:
+\*****************************************************************************/
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#include <d3d9.h>
+#include <dxvahd.h>
+#include <wtypes.h>
+#include <d3d9types.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/***************************************
+* cl_intel_dx9_media_sharing extension *
+****************************************/
+/* Extension marker macro; presumably lets client code test with #ifdef
+ * whether these declarations are available -- confirm. */
+#define cl_intel_dx9_media_sharing 1
+/* Both selector types are plain cl_uint; their values are the
+ * cl_dx9_device_source_intel / cl_dx9_device_set_intel groups below. */
+typedef cl_uint cl_dx9_device_source_intel;
+typedef cl_uint cl_dx9_device_set_intel;
+/* error codes
+ * NOTE(review): -1013 is also the value of
+ * CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR in cl_dx9_media_sharing.h, so the
+ * INTEL and KHR codes cannot be told apart by value alone. */
+#define CL_INVALID_DX9_DEVICE_INTEL                   -1010
+#define CL_INVALID_DX9_RESOURCE_INTEL                 -1011
+#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL        -1012
+#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL            -1013
+/* cl_dx9_device_source_intel (values are not consecutive) */
+#define CL_D3D9_DEVICE_INTEL                          0x4022
+#define CL_D3D9EX_DEVICE_INTEL                        0x4070
+#define CL_DXVA_DEVICE_INTEL                          0x4071
+/* cl_dx9_device_set_intel */
+#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL            0x4024
+#define CL_ALL_DEVICES_FOR_DX9_INTEL                  0x4025
+/* cl_context_info (one name per device source listed above) */
+#define CL_CONTEXT_D3D9_DEVICE_INTEL                  0x4026
+#define CL_CONTEXT_D3D9EX_DEVICE_INTEL                0x4072
+#define CL_CONTEXT_DXVA_DEVICE_INTEL                  0x4073
+/* cl_mem_info */
+#define CL_MEM_DX9_RESOURCE_INTEL                     0x4027
+#define CL_MEM_DX9_SHARED_HANDLE_INTEL                0x4074
+/* cl_image_info */
+#define CL_IMAGE_DX9_PLANE_INTEL                      0x4075
+/* cl_command_type (one value each for the acquire and release commands) */
+#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL          0x402A
+#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL          0x402B
+/******************************************************************************/
+/*
+ * clGetDeviceIDsFromDX9INTEL: an extern prototype followed by a
+ * pointer-to-function typedef (_fn) with the same parameter list. Returns a
+ * cl_int status. Parameter names appear only as comments.
+ * dx9_object is an untyped pointer; presumably its meaning is selected by
+ * dx9_device_source, and up to num_entries devices are written to devices
+ * with the count reported through num_devices -- TODO confirm against the
+ * extension specification.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromDX9INTEL(
+    cl_platform_id              /* platform */,
+    cl_dx9_device_source_intel  /* dx9_device_source */,
+    void*                       /* dx9_object */,
+    cl_dx9_device_set_intel     /* dx9_device_set */,
+    cl_uint                     /* num_entries */, 
+    cl_device_id*               /* devices */, 
+    cl_uint*                    /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
+    cl_platform_id              /* platform */,
+    cl_dx9_device_source_intel  /* dx9_device_source */,
+    void*                       /* dx9_object */,
+    cl_dx9_device_set_intel     /* dx9_device_set */,
+    cl_uint                     /* num_entries */, 
+    cl_device_id*               /* devices */, 
+    cl_uint*                    /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
+/*
+ * clCreateFromDX9MediaSurfaceINTEL: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_mem; errcode_ret is a
+ * pointer through which a cl_int can be passed back.
+ * IDirect3DSurface9, HANDLE and UINT are not OpenCL types; they are expected
+ * to come from the Direct3D/Windows headers included at the top of this
+ * file, so this header presumably only compiles where those are available.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromDX9MediaSurfaceINTEL(
+    cl_context                  /* context */,
+    cl_mem_flags                /* flags */,
+    IDirect3DSurface9*          /* resource */,
+    HANDLE                      /* sharedHandle */,
+    UINT                        /* plane */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
+    cl_context                  /* context */,
+    cl_mem_flags                /* flags */,
+    IDirect3DSurface9*          /* resource */,
+    HANDLE                      /* sharedHandle */,
+    UINT                        /* plane */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+/*
+ * clEnqueueAcquireDX9ObjectsINTEL: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_int status.
+ * mem_objects and event_wait_list are read-only arrays (const-qualified),
+ * presumably sized by num_objects and num_events_in_wait_list; event is an
+ * output pointer -- sizes inferred from the names, TODO confirm.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireDX9ObjectsINTEL(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    const cl_mem*               /* mem_objects */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    const cl_mem*               /* mem_objects */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+/*
+ * clEnqueueReleaseDX9ObjectsINTEL: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_int status.
+ * NOTE(review): mem_objects is declared as a non-const cl_mem* here, whereas
+ * the Acquire counterpart above takes const cl_mem*. Callers holding a
+ * const array need a cast; confirm against the upstream header before
+ * changing the qualifier.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseDX9ObjectsINTEL(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    cl_mem*                     /* mem_objects */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
+    cl_command_queue            /* command_queue */,
+    cl_uint                     /* num_objects */,
+    cl_mem*                     /* mem_objects */,
+    cl_uint                     /* num_events_in_wait_list */,
+    const cl_event*             /* event_wait_list */,
+    cl_event*                   /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_egl.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+#ifdef __APPLE__
+#else
+#include <CL/cl.h>
+#endif  
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Command types for events created by the EGL-sharing commands. The group
+ * holds one value each for fence-sync, acquire and release; note that the
+ * values are not listed in numeric order. */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
+/* Error type for clCreateFromEGLImageKHR
+ * NOTE(review): CL_EGL_RESOURCE_NOT_ACQUIRED_KHR presumably belongs to the
+ * enqueue commands rather than to image creation -- confirm. */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+/* properties passed to clCreateFromEGLImageKHR
+ * NOTE(review): intptr_t is not declared in this file; it presumably arrives
+ * through <CL/cl.h>. The __APPLE__ branch above includes nothing, so this
+ * typedef may not compile there -- confirm. */
+typedef intptr_t cl_egl_image_properties_khr;
+/* Extension marker macro for cl_khr_egl_image. */
+#define cl_khr_egl_image 1
+/*
+ * clCreateFromEGLImageKHR: extern prototype plus a pointer-to-function
+ * typedef (_fn) with the same parameter types. Returns a cl_mem; errcode_ret
+ * is a pointer through which a cl_int can be passed back.
+ * properties is a read-only array of cl_egl_image_properties_khr; how it is
+ * terminated is not visible here -- see the extension specification.
+ * Note that the prototype carries CL_API_SUFFIX__VERSION_1_0 while the
+ * typedef carries no version suffix.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context                  /* context */,
+                        CLeglDisplayKHR             /* egldisplay */,
+                        CLeglImageKHR               /* eglimage */,
+                        cl_mem_flags                /* flags */,
+                        const cl_egl_image_properties_khr * /* properties */,
+                        cl_int *                    /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+	cl_context                  context,
+	CLeglDisplayKHR             egldisplay,
+	CLeglImageKHR               eglimage,
+	cl_mem_flags                flags,
+	const cl_egl_image_properties_khr * properties,
+	cl_int *                    errcode_ret);
+/*
+ * clEnqueueAcquireEGLObjectsKHR: extern prototype plus a pointer-to-function
+ * typedef (_fn) with the same parameter types. Returns a cl_int status.
+ * mem_objects and event_wait_list are read-only arrays, presumably sized by
+ * num_objects and num_events_in_wait_list; event is an output pointer --
+ * sizes inferred from the names, TODO confirm.
+ * The prototype carries CL_API_SUFFIX__VERSION_1_0; the typedef has no suffix.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+/*
+ * clEnqueueReleaseEGLObjectsKHR: extern prototype plus a pointer-to-function
+ * typedef (_fn). The parameter list is identical to the Acquire declaration
+ * above: a command queue, a read-only cl_mem array with its count, a
+ * read-only event wait list with its count, and an output event pointer.
+ * Returns a cl_int status.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+/* Extension marker macro for cl_khr_egl_event. */
+#define cl_khr_egl_event 1
+/*
+ * clCreateEventFromEGLSyncKHR: extern prototype plus a pointer-to-function
+ * typedef (_fn). Returns a cl_event; errcode_ret is a pointer through which
+ * a cl_int can be passed back. sync and display are the opaque void*
+ * handle types (CLeglSyncKHR, CLeglDisplayKHR) declared earlier in this file.
+ */
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context      /* context */,
+                            CLeglSyncKHR    /* sync */,
+                            CLeglDisplayKHR /* display */,
+                            cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+	cl_context      context,
+	CLeglSyncKHR    sync,
+	CLeglDisplayKHR display,
+	cl_int *        errcode_ret);
+#ifdef __cplusplus
+}
+#endif
+#endif /* __OPENCL_CL_EGL_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_ext.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h
+/*******************************************************************************
+ * Copyright (c) 2008-2018 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+#else
+    #include <CL/cl.h>
+#endif
+/* cl_khr_fp64 extension - no extension #define since it has no functions  */
+/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL/cl.h for OpenCL >= 120, so it
+ * is only defined here when the target version is 110 or lower. This build
+ * sets CL_TARGET_OPENCL_VERSION=220 in CMakeLists.txt, so the branch below
+ * is skipped. */
+#if CL_TARGET_OPENCL_VERSION <= 110
+#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032
+#endif
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+/* Defined unconditionally, unlike the double-precision query above. */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * NOTE(review): no clSetMemObjectCallbackFn is declared in this header; the
+ * text above presumably refers to clSetMemObjectDestructorAPPLE below -- confirm.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+/* Returns a cl_int status. pfn_notify receives the cl_mem and the user_data
+ * pointer given here. Unlike the other prototypes in this header, this one
+ * is written without "extern" and without CL_API_CALL. */
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */,
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * All three share one signature: (errstr, private_info, cb, user_data) and return void.
+ *
+ * clLogMessagesToSystemLogAPPLE forwards all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */,
+                                            const void * /* private_info */,
+                                            size_t       /* cb */,
+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
+/* clLogMessagesToStdoutAPPLE sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */,
+                                          const void * /* private_info */,
+                                          size_t       /* cb */,
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+/* clLogMessagesToStderrAPPLE sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */,
+                                          const void * /* private_info */,
+                                          size_t       /* cb */,
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+/*
+ * clIcdGetPlatformIDsKHR: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_int status. Neither
+ * declaration carries a version-suffix macro. From the parameter names, up
+ * to num_entries platforms are presumably written to platforms and the
+ * count reported through num_platforms -- TODO confirm.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint *        /* num_platforms */);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint          /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint *        /* num_platforms */);
+/*******************************
+ * cl_khr_il_program extension *
+ *******************************/
+#define cl_khr_il_program 1
+/* New property to clGetDeviceInfo for retrieving supported intermediate
+ * languages
+ */
+#define CL_DEVICE_IL_VERSION_KHR                    0x105B
+/* New property to clGetProgramInfo for retrieving the IL of a
+ * program
+ */
+#define CL_PROGRAM_IL_KHR                           0x1169
+/*
+ * clCreateProgramWithILKHR: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_program; il is a read-only
+ * buffer and length is a size_t, presumably its size in bytes -- confirm.
+ * Note that only the typedef carries CL_EXT_SUFFIX__VERSION_1_2; the
+ * prototype has no version suffix.
+ */
+extern CL_API_ENTRY cl_program
+  CL_API_CALL clCreateProgramWithILKHR(
+      cl_context /* context */,
+      const void * /* il */,
+      size_t /* length */,
+      cl_int * /* errcode_ret */);
+typedef CL_API_ENTRY cl_program
+  (CL_API_CALL *clCreateProgramWithILKHR_fn)(
+      cl_context /* context */,
+      const void * /* il */,
+      size_t /* length */,
+      cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+/* This extension contributes a single enumerant and no functions. */
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
+#define cl_khr_terminate_context 1
+/* clTerminateContextKHR: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Takes the context and returns a cl_int
+ * status. */
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ *
+ * Only two enumerants are defined for it here; no extension marker macro
+ * and no functions are declared.
+ */
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+/*****************************************
+ * cl_khr_create_command_queue extension *
+ *****************************************/
+#define cl_khr_create_command_queue 1
+/* Property element type for the properties array taken below; an alias of
+ * cl_bitfield. */
+typedef cl_bitfield cl_queue_properties_khr;
+/*
+ * clCreateCommandQueueWithPropertiesKHR: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_command_queue; errcode_ret
+ * is a pointer through which a cl_int can be passed back. properties is a
+ * read-only array of cl_queue_properties_khr; how it is terminated is not
+ * visible here -- see the extension specification.
+ */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
+                                       cl_device_id /* device */,
+                                       const cl_queue_properties_khr* /* properties */,
+                                       cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_command_queue
+(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
+                                                         cl_device_id /* device */,
+                                                         const cl_queue_properties_khr* /* properties */,
+                                                         cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+/* Seven consecutive enumerants, 0x4000 through 0x4006. */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+/* Only one enumerant is defined for this extension in this header. */
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+/*********************************
+* cl_arm_printf extension
+*********************************/
+/* Two enumerants; no extension marker macro and no functions are declared. */
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+/***********************************
+* cl_ext_device_fission extension
+***********************************/
+#define cl_ext_device_fission   1
+/* clReleaseDeviceEXT / clRetainDeviceEXT: each is an extern prototype plus
+ * a matching pointer-to-function typedef (_fn); both take a cl_device_id
+ * and return a cl_int status. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+/* Element type of the properties list passed to clCreateSubDevicesEXT;
+ * an alias of cl_ulong. */
+typedef cl_ulong  cl_device_partition_property_ext;
+/*
+ * clCreateSubDevicesEXT: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_int status. properties is
+ * a read-only list; the *_LIST_END_EXT macros defined further down are its
+ * terminators. From the parameter names, up to num_entries sub-devices are
+ * presumably written to out_devices and the count reported through
+ * num_devices -- TODO confirm.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
+                        const cl_device_partition_property_ext * /* properties */,
+                        cl_uint /*num_entries*/,
+                        cl_device_id * /*out_devices*/,
+                        cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+typedef CL_API_ENTRY cl_int
+( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
+                                            const cl_device_partition_property_ext * /* properties */,
+                                            cl_uint /*num_entries*/,
+                                            cl_device_id * /*out_devices*/,
+                                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+/* cl_device_partition_property_ext */
+#define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+#define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+#define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+/* clDeviceGetInfo selectors */
+#define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+#define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+#define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+#define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+#define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+/* error codes */
+#define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+#define CL_INVALID_PARTITION_COUNT_EXT              -1058
+#define CL_INVALID_PARTITION_NAME_EXT               -1059
+/* CL_AFFINITY_DOMAINs (L1..L4 use 0x1..0x4; these are not one-bit-each flags) */
+#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+#define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+/* cl_device_partition_property_ext list terminators */
+/* In the BY_NAMES terminator the cast binds tighter than the subtraction,
+ * so the expression is (cl_device_partition_property_ext)0 minus 1. Assuming
+ * cl_ulong is unsigned, that wraps to the type's maximum value -- confirm. */
+#define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+#define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+#define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+/***********************************
+ * cl_ext_migrate_memobject extension definitions
+ ***********************************/
+#define cl_ext_migrate_memobject 1
+/* Flags type for the flags parameter below; an alias of cl_bitfield. */
+typedef cl_bitfield cl_mem_migration_flags_ext;
+/* The only flag value defined for cl_mem_migration_flags_ext in this header. */
+#define CL_MIGRATE_MEM_OBJECT_HOST_EXT              0x1
+/* Command type value; presumably reported for events created by
+ * clEnqueueMigrateMemObjectEXT -- confirm. */
+#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT           0x4040
+/*
+ * clEnqueueMigrateMemObjectEXT: extern prototype plus a matching
+ * pointer-to-function typedef (_fn). Returns a cl_int status. Neither
+ * declaration carries a version-suffix macro. mem_objects and
+ * event_wait_list are read-only arrays, presumably sized by num_mem_objects
+ * and num_events_in_wait_list; event is an output pointer -- TODO confirm.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjectEXT( cl_command_queue /* command_queue */,
+                              cl_uint /* num_mem_objects */,
+                              const cl_mem * /* mem_objects */,
+                              cl_mem_migration_flags_ext /* flags */,
+                              cl_uint /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event * /* event */ );
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)( cl_command_queue /* command_queue */,
+                                                cl_uint /* num_mem_objects */,
+                                                const cl_mem * /* mem_objects */,
+                                                cl_mem_migration_flags_ext /* flags */,
+                                                cl_uint /* num_events_in_wait_list */,
+                                                const cl_event * /* event_wait_list */,
+                                                cl_event * /* event */ );
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+#define cl_qcom_ext_host_ptr 1
+/* The only bit-flag in this group (bit 29); the 0x40Ax values that follow
+ * are plain enumerants. Presumably combined into cl_mem_flags -- confirm. */
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+/* Presumably the values for cl_mem_ext_host_ptr.host_cache_policy -- confirm. */
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+/* Type of the param_name argument of clGetDeviceImageInfoQCOM. */
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+/*
+ * clGetDeviceImageInfoQCOM: extern prototype only; no _fn typedef is
+ * declared for it here. Returns a cl_int status. The trailing
+ * (param_name, param_value_size, param_value, param_value_size_ret)
+ * parameters presumably follow the usual OpenCL clGet*Info convention --
+ * confirm against the extension specification.
+ */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+/* Common header struct: it is embedded as the first member (ext_host_ptr)
+ * of the ION and Android-native-buffer structs declared later in this file. */
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+    /* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+} cl_mem_ext_host_ptr;
+/*******************************************
+* cl_qcom_ext_host_ptr_iocoherent extension
+********************************************/
+/* Cache policy specifying io-coherence */
+/* Its value (0x40A9) comes after CL_MEM_ION_HOST_PTR_QCOM (0x40A8), which is
+ * defined just below. */
+#define CL_MEM_HOST_IOCOHERENT_QCOM               0x40A9
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+/* Describes an ION allocation: the common cl_mem_ext_host_ptr header comes
+ * first, followed by the ION-specific fields. */
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+    /* ION file descriptor */
+    int                  ion_filedesc;
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
+} cl_mem_ion_host_ptr;
+/*********************************
+* cl_qcom_android_native_buffer_host_ptr extension
+*
+* Declares cl_mem_android_native_buffer_host_ptr: a cl_mem_ext_host_ptr
+* header followed by a pointer to the Android native buffer.
+*********************************/
+#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM                  0x40C6
+typedef struct _cl_mem_android_native_buffer_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers
+     * (presumably stored in ext_host_ptr.allocation_type; confirm against
+     * the extension spec). */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+    /* Virtual pointer to the android native buffer */
+    void*                anb_ptr;
+} cl_mem_android_native_buffer_host_ptr;
+/******************************************
+ * cl_img_yuv_image extension *
+ ******************************************/
+/* Image formats used in clCreateImage
+ * (two consecutive enumerants, 0x40D0 and 0x40D1). */
+#define CL_NV21_IMG                                 0x40D0
+#define CL_YV12_IMG                                 0x40D1
+/******************************************
+ * cl_img_cached_allocations extension *
+ ******************************************/
+/* Flag values used by clCreateBuffer (single bits 26 and 27) */
+#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG          (1 << 26)
+#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG            (1 << 27)
+/******************************************
+ * cl_img_use_gralloc_ptr extension *
+ *
+ * Declares clEnqueueAcquireGrallocObjectsIMG and
+ * clEnqueueReleaseGrallocObjectsIMG. Both take the same parameter list:
+ * a command queue, a count plus array of cl_mem objects, an event wait
+ * list and an output event. Parameter names appear only as comments.
+ ******************************************/
+#define cl_img_use_gralloc_ptr 1
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_GRALLOC_PTR_IMG                  (1 << 28)
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
+#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
+/* Error code from clEnqueueReleaseGrallocObjectsIMG */
+#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGrallocObjectsIMG(cl_command_queue      /* command_queue */,
+                                  cl_uint               /* num_objects */,
+                                  const cl_mem *        /* mem_objects */,
+                                  cl_uint               /* num_events_in_wait_list */,
+                                  const cl_event *      /* event_wait_list */,
+                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGrallocObjectsIMG(cl_command_queue      /* command_queue */,
+                                  cl_uint               /* num_objects */,
+                                  const cl_mem *        /* mem_objects */,
+                                  cl_uint               /* num_events_in_wait_list */,
+                                  const cl_event *      /* event_wait_list */,
+                                  cl_event *            /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+/*********************************
+* cl_khr_subgroups extension
+*
+* Declares clGetKernelSubGroupInfoKHR and the matching function-pointer
+* typedef clGetKernelSubGroupInfoKHR_fn; both carry the
+* CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED suffix.
+*********************************/
+#define cl_khr_subgroups 1
+#if !defined(CL_VERSION_2_1)
+/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in cl.h,
+   so the typedef below is only provided when CL_VERSION_2_1 is not defined.
+   In hindsight, there should have been a khr suffix on this type for
+   the extension, but keeping it un-suffixed to maintain backwards
+   compatibility. */
+typedef cl_uint             cl_kernel_sub_group_info;
+#endif
+/* cl_kernel_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR    0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR       0x2034
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+                           cl_device_id /*in_device*/,
+                           cl_kernel_sub_group_info /* param_name */,
+                           size_t /*input_value_size*/,
+                           const void * /*input_value*/,
+                           size_t /*param_value_size*/,
+                           void* /*param_value*/,
+                           size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+                              cl_device_id /*in_device*/,
+                              cl_kernel_sub_group_info /* param_name */,
+                              size_t /*input_value_size*/,
+                              const void * /*input_value*/,
+                              size_t /*param_value_size*/,
+                              void* /*param_value*/,
+                              size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+/*********************************
+* cl_khr_priority_hints extension
+*********************************/
+/* This extension define is for backwards compatibility.
+   It shouldn't be required since this extension has no new functions. */
+#define cl_khr_priority_hints 1
+typedef cl_uint  cl_queue_priority_khr;
+/* cl_command_queue_properties */
+#define CL_QUEUE_PRIORITY_KHR 0x1096
+/* cl_queue_priority_khr: three distinct single-bit values (bits 0, 1, 2) */
+#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
+#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
+#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
+/*********************************
+* cl_khr_throttle_hints extension
+*********************************/
+/* This extension define is for backwards compatibility.
+   It shouldn't be required since this extension has no new functions. */
+#define cl_khr_throttle_hints 1
+typedef cl_uint  cl_queue_throttle_khr;
+/* cl_command_queue_properties */
+#define CL_QUEUE_THROTTLE_KHR 0x1097
+/* cl_queue_throttle_khr: three distinct single-bit values (bits 0, 1, 2) */
+#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
+#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
+#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
+/*********************************
+* cl_khr_subgroup_named_barrier extension
+*********************************/
+/* This extension define is for backwards compatibility.
+   It shouldn't be required since this extension has no new functions. */
+#define cl_khr_subgroup_named_barrier 1
+/* cl_device_info: the only enumerant this section adds */
+#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035
+/**********************************
+ * cl_arm_import_memory extension *
+ **********************************/
+#define cl_arm_import_memory 1
+typedef intptr_t cl_import_properties_arm; /* element type of the properties array passed to clImportMemoryARM */
+/* Default and valid property name for cl_arm_import_memory */
+#define CL_IMPORT_TYPE_ARM                        0x40B2
+/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
+/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
+/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_SECURE_ARM                 0x40B5
+/* This extension adds a new function that allows for direct memory import into
+ * OpenCL via the clImportMemoryARM function.
+ *
+ * Memory imported through this interface will be mapped into the device's page
+ * tables directly, providing zero copy access. It will never fall back to copy
+ * operations and aliased buffers.
+ *
+ * Types of memory supported for import are specified as additional extension
+ * strings.
+ *
+ * This extension produces cl_mem allocations which are compatible with all other
+ * users of cl_mem in the standard API.
+ *
+ * This extension maps pages with the same properties as the normal buffer creation
+ * function clCreateBuffer.
+ *
+ * The prototype below returns the resulting cl_mem and has a cl_int
+ * *errcode_ret out-parameter.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clImportMemoryARM( cl_context context,
+                   cl_mem_flags flags,
+                   const cl_import_properties_arm *properties,
+                   void *memory,
+                   size_t size,
+                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
+/******************************************
+ * cl_arm_shared_virtual_memory extension *
+ *
+ * Declares the ARM-suffixed SVM entry points clSVMAllocARM, clSVMFreeARM,
+ * clEnqueueSVMFreeARM, clEnqueueSVMMemcpyARM, clEnqueueSVMMemFillARM,
+ * clEnqueueSVMMapARM, clEnqueueSVMUnmapARM, clSetKernelArgSVMPointerARM
+ * and clSetKernelExecInfoARM, together with their enumerants and types.
+ * Parameter names appear only as comments in the prototypes.
+ ******************************************/
+#define cl_arm_shared_virtual_memory 1
+/* Used by clGetDeviceInfo */
+#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
+/* Used by clGetMemObjectInfo */
+#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
+/* Used by clSetKernelExecInfoARM: */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
+#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
+#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
+#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
+#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
+/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
+/* Flag values used by clSVMAllocARM: */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
+#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
+typedef cl_bitfield cl_svm_mem_flags_arm;           /* type of the flags argument of clSVMAllocARM */
+typedef cl_uint     cl_kernel_exec_info_arm;        /* type of the param_name argument of clSetKernelExecInfoARM */
+typedef cl_bitfield cl_device_svm_capabilities_arm; /* not used by any prototype in this section */
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAllocARM(cl_context       /* context */,
+              cl_svm_mem_flags_arm /* flags */,
+              size_t           /* size */,
+              cl_uint          /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFreeARM(cl_context        /* context */,
+             void *            /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFreeARM(cl_command_queue  /* command_queue */,
+                    cl_uint           /* num_svm_pointers */,
+                    void *[]          /* svm_pointers[] */,
+                    void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                           cl_uint          /* num_svm_pointers */,
+                                                           void *[]         /* svm_pointers[] */,
+                                                           void *           /* user_data */),
+                    void *            /* user_data */,
+                    cl_uint           /* num_events_in_wait_list */,
+                    const cl_event *  /* event_wait_list */,
+                    cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpyARM(cl_command_queue  /* command_queue */,
+                      cl_bool           /* blocking_copy */,
+                      void *            /* dst_ptr */,
+                      const void *      /* src_ptr */,
+                      size_t            /* size */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFillARM(cl_command_queue  /* command_queue */,
+                       void *            /* svm_ptr */,
+                       const void *      /* pattern */,
+                       size_t            /* pattern_size */,
+                       size_t            /* size */,
+                       cl_uint           /* num_events_in_wait_list */,
+                       const cl_event *  /* event_wait_list */,
+                       cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMapARM(cl_command_queue  /* command_queue */,
+                   cl_bool           /* blocking_map */,
+                   cl_map_flags      /* flags */,
+                   void *            /* svm_ptr */,
+                   size_t            /* size */,
+                   cl_uint           /* num_events_in_wait_list */,
+                   const cl_event *  /* event_wait_list */,
+                   cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmapARM(cl_command_queue  /* command_queue */,
+                     void *            /* svm_ptr */,
+                     cl_uint           /* num_events_in_wait_list */,
+                     const cl_event *  /* event_wait_list */,
+                     cl_event *        /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointerARM(cl_kernel    /* kernel */,
+                            cl_uint      /* arg_index */,
+                            const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfoARM(cl_kernel            /* kernel */,
+                       cl_kernel_exec_info_arm  /* param_name */,
+                       size_t               /* param_value_size */,
+                       const void *         /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CL_EXT_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h
+/*******************************************************************************
+ * Copyright (c) 2008-2017 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+/*****************************************************************************\
+Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+File Name: cl_ext_intel.h
+Abstract:
+Notes:
+\*****************************************************************************/
+#ifndef __CL_EXT_INTEL_H
+#define __CL_EXT_INTEL_H
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+    #include <OpenCL/cl_platform.h>
+#else
+    #include <CL/cl.h>
+    #include <CL/cl_platform.h>
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+/***************************************
+* cl_intel_thread_local_exec extension *
+****************************************/
+#define cl_intel_thread_local_exec 1
+#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31) /* bit 31; the 1 is cast to cl_bitfield before the shift, so the shift is not done on a plain int */
+/***********************************************
+* cl_intel_device_partition_by_names extension *
+************************************************/
+#define cl_intel_device_partition_by_names 1
+#define CL_DEVICE_PARTITION_BY_NAMES_INTEL          0x4052
+#define CL_PARTITION_BY_NAMES_LIST_END_INTEL        -1 /* list terminator; expands to an unparenthesized negative literal */
+/************************************************
+* cl_intel_accelerator extension                *
+* cl_intel_motion_estimation extension          *
+* cl_intel_advanced_motion_estimation extension *
+*************************************************/
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+#define cl_intel_advanced_motion_estimation 1
+typedef struct _cl_accelerator_intel* cl_accelerator_intel; /* pointer to a struct that is only forward-declared here */
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+typedef struct _cl_motion_estimation_desc_intel { /* four cl_uint settings; the CL_ME_MB_TYPE_*, CL_ME_SUBPIXEL_MODE_*, CL_ME_SAD_ADJUST_MODE_* and CL_ME_SEARCH_PATH_* constants below presumably supply their values (confirm against the extension spec) */
+    cl_uint mb_block_type;
+    cl_uint subpixel_mode;
+    cl_uint sad_adjust_mode;
+    cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+/* error codes */
+#define CL_INVALID_ACCELERATOR_INTEL                              -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL                         -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL                   -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL                   -1097
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL               0x0
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL                           0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL                      0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL                              0x4092
+#define CL_ACCELERATOR_TYPE_INTEL                                 0x4093
+/* cl_motion_estimation_desc_intel flags and related motion-estimation constants */
+#define CL_ME_MB_TYPE_16x16_INTEL                                 0x0
+#define CL_ME_MB_TYPE_8x8_INTEL                                   0x1
+#define CL_ME_MB_TYPE_4x4_INTEL                                   0x2
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL                         0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                            0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                            0x2
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                          0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                          0x1
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL                        0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL                        0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL                      0x5
+#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL                         0x0
+#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL                  0x1
+#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL                    0x2
+#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL                           0x4
+#define CL_ME_FORWARD_INPUT_MODE_INTEL                            0x1
+#define CL_ME_BACKWARD_INPUT_MODE_INTEL                           0x2
+#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL                        0x3
+#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL                          16 /* the BIDIR_WEIGHT values are written in decimal, unlike the hex constants around them */
+#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL                            21
+#define CL_ME_BIDIR_WEIGHT_HALF_INTEL                             32
+#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL                        43
+#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL                    48
+#define CL_ME_COST_PENALTY_NONE_INTEL                             0x0
+#define CL_ME_COST_PENALTY_LOW_INTEL                              0x1
+#define CL_ME_COST_PENALTY_NORMAL_INTEL                           0x2
+#define CL_ME_COST_PENALTY_HIGH_INTEL                             0x3
+#define CL_ME_COST_PRECISION_QPEL_INTEL                           0x0
+#define CL_ME_COST_PRECISION_HPEL_INTEL                           0x1
+#define CL_ME_COST_PRECISION_PEL_INTEL                            0x2
+#define CL_ME_COST_PRECISION_DPEL_INTEL                           0x3
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL                  0x0
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL                0x1
+#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                        0x2
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL        0x3
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL       0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL                     0x4 /* NOTE(review): same value as ..._DIAGONAL_DOWN_RIGHT_INTEL on the previous line; confirm this is intended */
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL            0x5
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL           0x6
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL             0x7
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL             0x8
+#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                      0x0
+#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL              0x1
+#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL                0x2
+#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL                   0x3
+/* cl_device_info */
+#define CL_DEVICE_ME_VERSION_INTEL                                0x407E
+#define CL_ME_VERSION_LEGACY_INTEL                                0x0
+#define CL_ME_VERSION_ADVANCED_VER_1_INTEL                        0x1
+#define CL_ME_VERSION_ADVANCED_VER_2_INTEL                        0x2
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL /* each entry point in this group is declared twice: as an extern prototype and as a matching <name>_fn function-pointer typedef */
+clCreateAcceleratorINTEL(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL /* query: param_name is a cl_accelerator_info_intel */
+clGetAcceleratorInfoINTEL(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL /* retain/release pair: each takes only the accelerator handle */
+clRetainAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+/******************************************
+* cl_intel_simultaneous_sharing extension *
+*
+* Adds two CL_DEVICE_* enumerants; no functions are declared here.
+*******************************************/
+#define cl_intel_simultaneous_sharing 1
+#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL            0x4104
+#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL        0x4105
+/***********************************
+* cl_intel_egl_image_yuv extension *
+*
+* Adds a single enumerant; no functions are declared here.
+************************************/
+#define cl_intel_egl_image_yuv 1
+#define CL_EGL_YUV_PLANE_INTEL                           0x4107
+/********************************
+* cl_intel_packed_yuv extension *
+*
+* Four consecutive enumerants (0x4076-0x4079), one per packed YUV
+* component ordering named in the macro.
+*********************************/
+#define cl_intel_packed_yuv 1
+#define CL_YUYV_INTEL                                    0x4076
+#define CL_UYVY_INTEL                                    0x4077
+#define CL_YVYU_INTEL                                    0x4078
+#define CL_VYUY_INTEL                                    0x4079
+/********************************************
+* cl_intel_required_subgroup_size extension *
+*
+* Adds one CL_DEVICE_* and two CL_KERNEL_* enumerants; no functions are
+* declared here.
+*********************************************/
+#define cl_intel_required_subgroup_size 1
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL                  0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL                   0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL           0x410A
+/****************************************
+* cl_intel_driver_diagnostics extension *
+*****************************************/
+#define cl_intel_driver_diagnostics 1
+typedef cl_uint cl_diagnostics_verbose_level;
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL                0x4106
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL           ( 0xff ) /* low eight bits set, so it includes the three single-bit levels below */
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL          ( 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL           ( 1 << 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL       ( 1 << 2 )
+/********************************
+* cl_intel_planar_yuv extension *
+*
+* Unlike the neighbouring sections, this one does not define an
+* extension name macro (no "#define cl_intel_planar_yuv 1").
+*********************************/
+#define CL_NV12_INTEL                                       0x410E
+#define CL_MEM_NO_ACCESS_INTEL                              ( 1 << 24 )
+#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL              ( 1 << 25 )
+#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL                0x417E
+#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL               0x417F
+/*******************************************************
+* cl_intel_device_side_avc_motion_estimation extension *
+*
+* Device-info enumerants followed by the two AVC motion-estimation
+* version values. The version macros expand to a bare integer constant
+* with no trailing semicolon, so they can be used inside expressions,
+* comparisons and initializers.
+********************************************************/
+#define CL_DEVICE_AVC_ME_VERSION_INTEL                      0x410B
+#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
+#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL          0x410D
+#define CL_AVC_ME_VERSION_0_INTEL                           0x0   /* No support. */
+#define CL_AVC_ME_VERSION_1_INTEL                           0x1   /* First supported version. */
+#define CL_AVC_ME_MAJOR_16x16_INTEL                         0x0 /* constants are grouped by name prefix; each group restarts its own numbering */
+#define CL_AVC_ME_MAJOR_16x8_INTEL                          0x1
+#define CL_AVC_ME_MAJOR_8x16_INTEL                          0x2
+#define CL_AVC_ME_MAJOR_8x8_INTEL                           0x3
+#define CL_AVC_ME_MINOR_8x8_INTEL                           0x0
+#define CL_AVC_ME_MINOR_8x4_INTEL                           0x1
+#define CL_AVC_ME_MINOR_4x8_INTEL                           0x2
+#define CL_AVC_ME_MINOR_4x4_INTEL                           0x3
+#define CL_AVC_ME_MAJOR_FORWARD_INTEL                       0x0
+#define CL_AVC_ME_MAJOR_BACKWARD_INTEL                      0x1
+#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL                 0x2
+#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL                  0x0
+#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL                0x7E /* each named mask below is 0x7F with exactly one bit cleared (bit 0 here, up to bit 6 for 4x4) */
+#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL                 0x7D
+#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL                 0x7B
+#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL                  0x77
+#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL                  0x6F
+#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL                  0x5F
+#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL                  0x3F
+#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL            0x0
+#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL                 0x1
+#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL                  0x2
+#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL            0x3
+#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL               0x4
+#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL         0x5
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL             0x6
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL             0x7
+#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL                0x8
+#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL          0x9
+#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL            0x2 /* NOTE(review): same value as CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL (0x2); confirm this alias is intended */
+#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL            0xa
+#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0
+#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x2
+#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0
+#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1
+#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x3
+#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL                 0x0
+#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL                 0x1
+#define CL_AVC_ME_COST_PRECISION_PEL_INTEL                  0x2
+#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL                 0x3
+#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL                0x10 /* 0x10, 0x15, 0x20, 0x2B, 0x30 are 16, 21, 32, 43, 48, matching the decimal CL_ME_BIDIR_WEIGHT_*_INTEL values earlier in this header */
+#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL                  0x15
+#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL                   0x20
+#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL              0x2B
+#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL          0x30
+#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL                 0x0
+#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL                0x2
+#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL                  0x4
+#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL               0x8
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL          0x0
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL            0x4000
+#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL     ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL        ( 0x3 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL       ( 0x55 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL      ( 0xAA << 24 ) /* NOTE(review): 0xAA << 24, 0xFF << 24 and 0x2 << 30 exceed INT_MAX when int is 32 bits, yet the operands are plain int constants; confirm the intended type */
+#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL          ( 0xFF << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL     ( 0x1 << 24 ) /* the 8x8_0 pair has the same values as the 16x16 FORWARD/BACKWARD pair above; the 8x8_N pairs step by two bit positions (24, 26, 28, 30) */
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL    ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL     ( 0x1 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL    ( 0x2 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL     ( 0x1 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL    ( 0x2 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL     ( 0x1 << 30 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL    ( 0x2 << 30 )
+#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL                0x00
+#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL                0x80
+#define CL_AVC_ME_INTRA_16x16_INTEL                         0x0
+#define CL_AVC_ME_INTRA_8x8_INTEL                           0x1
+#define CL_AVC_ME_INTRA_4x4_INTEL                           0x2
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL     0x6 /* each mask is 0x7 with one bit cleared (bit 0, 1, 2 respectively) */
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL       0x5
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL       0x3
+#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL         0x60
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL        0x10
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL  0x8
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL   0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4 /* NOTE(review): same value as ..._DIAGONAL_DOWN_RIGHT_INTEL on the previous line; confirm this is intended */
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3
+#define CL_AVC_ME_FRAME_FORWARD_INTEL                       0x1
+#define CL_AVC_ME_FRAME_BACKWARD_INTEL                      0x2
+#define CL_AVC_ME_FRAME_DUAL_INTEL                          0x3 /* equals FORWARD | BACKWARD (0x1 | 0x2) */
+#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL                     0x0
+#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL                    0x1
+#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL                    0x2
+#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL           0x0
+#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL        0x1
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CL_EXT_INTEL_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_gl.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h
+/**********************************************************************************
+ * Copyright (c) 2008-2018 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif	
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef cl_uint     cl_gl_object_type;    /* holds a CL_GL_OBJECT_* value; clGetGLObjectInfo takes a pointer to one */
+typedef cl_uint     cl_gl_texture_info;   /* type of the param_name argument of clGetGLTextureInfo */
+typedef cl_uint     cl_gl_platform_info;  /* not referenced by any declaration in this header */
+typedef struct __GLsync *cl_GLsync;       /* pointer to the incomplete struct __GLsync; no GL header is included here */
+/* cl_gl_object_type values.
+ * Token values 0x2000 - 0x200F are all already assigned: the object types here,
+ * the texture-info, context-info and context-property tokens further down this
+ * header, and 0x200D in cl_gl_ext.h.  The OpenCL 1.2 object types therefore
+ * continue past that range at 0x2010 and 0x2011. */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#ifdef CL_VERSION_1_2
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+#endif
+/* cl_gl_texture_info tokens (cl_gl_texture_info is the param_name type of
+ * clGetGLTextureInfo).  CL_GL_NUM_SAMPLES is only defined when
+ * CL_VERSION_1_2 is defined. */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#ifdef CL_VERSION_1_2
+#define CL_GL_NUM_SAMPLES                       0x2012
+#endif
+/*
+ * Prototype for clCreateFromGLBuffer (tagged CL_API_SUFFIX__VERSION_1_0).
+ * Takes a context, memory flags and a GL buffer object name and returns a
+ * cl_mem.  errcode_ret is spelled cl_int * rather than plain int * so the
+ * signature agrees with every other errcode_ret parameter in this header.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     /* context */,
+                     cl_mem_flags   /* flags */,
+                     cl_GLuint      /* bufobj */,
+                     cl_int *       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+#if defined(CL_VERSION_1_2)
+/* Prototype for clCreateFromGLTexture; only declared when CL_VERSION_1_2 is
+ * defined.  Returns a cl_mem. */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(
+    cl_context   context,
+    cl_mem_flags flags,
+    cl_GLenum    target,
+    cl_GLint     miplevel,
+    cl_GLuint    texture,
+    cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+#endif
+/* Prototype for clCreateFromGLRenderbuffer (tagged CL_API_SUFFIX__VERSION_1_0).
+ * Returns a cl_mem. */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(
+    cl_context   context,
+    cl_mem_flags flags,
+    cl_GLuint    renderbuffer,
+    cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+/* Prototype for clGetGLObjectInfo (tagged CL_API_SUFFIX__VERSION_1_0).
+ * Takes a cl_mem plus two pointers (object type and object name) and returns
+ * a cl_int. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(
+    cl_mem              memobj,
+    cl_gl_object_type * gl_object_type,
+    cl_GLuint *         gl_object_name) CL_API_SUFFIX__VERSION_1_0;
+/* Prototype for clGetGLTextureInfo (tagged CL_API_SUFFIX__VERSION_1_0).
+ * param_name is a cl_gl_texture_info token; the remaining arguments are a
+ * value buffer, its size, and a size_t pointer.  Returns a cl_int. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(
+    cl_mem             memobj,
+    cl_gl_texture_info param_name,
+    size_t             param_value_size,
+    void *             param_value,
+    size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+/* Prototype for clEnqueueAcquireGLObjects (tagged CL_API_SUFFIX__VERSION_1_0).
+ * Takes a command queue, an array of cl_mem objects with its length, an event
+ * wait list with its length, and a cl_event pointer.  Returns a cl_int. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+/* Prototype for clEnqueueReleaseGLObjects (tagged CL_API_SUFFIX__VERSION_1_0).
+ * Same parameter list as clEnqueueAcquireGLObjects: command queue, cl_mem
+ * array and length, event wait list and length, and a cl_event pointer.
+ * Returns a cl_int. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+/* Deprecated OpenCL 1.1 APIs.
+ * The declarations are wrapped in the CL_EXT_PREFIX/SUFFIX__VERSION_1_1_DEPRECATED
+ * macros supplied by cl_platform.h. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(
+    cl_context   context,
+    cl_mem_flags flags,
+    cl_GLenum    target,
+    cl_GLint     miplevel,
+    cl_GLuint    texture,
+    cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* Prototype for the deprecated clCreateFromGLTexture3D; it is wrapped in the
+ * CL_EXT_PREFIX/SUFFIX__VERSION_1_1_DEPRECATED macros.  Returns a cl_mem. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(
+    cl_context   context,
+    cl_mem_flags flags,
+    cl_GLenum    target,
+    cl_GLint     miplevel,
+    cl_GLuint    texture,
+    cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* cl_khr_gl_sharing extension.
+ * A macro named after the extension is defined to 1, followed by the
+ * extension's info type, error code and tokens. */
+#define cl_khr_gl_sharing 1
+typedef cl_uint     cl_gl_context_info;
+/* Additional Error Codes (a negative value, written without parentheses) */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
+/* cl_gl_context_info tokens: cl_gl_context_info is the param_name type of
+ * clGetGLContextInfoKHR */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+/* Additional cl_context_properties tokens; clGetGLContextInfoKHR takes a
+ * const cl_context_properties * as its first argument */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+/* Prototype for clGetGLContextInfoKHR (tagged CL_API_SUFFIX__VERSION_1_0).
+ * Takes a context-properties list, a cl_gl_context_info token, a value buffer
+ * with its size, and a size_t pointer.  Returns a cl_int. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/* Function-pointer type with the same parameter list and return type as
+ * clGetGLContextInfoKHR above. */
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clGetGLContextInfoKHR_fn)(const cl_context_properties * properties,
+                                            cl_gl_context_info            param_name,
+                                            size_t                        param_value_size,
+                                            void *                        param_value,
+                                            size_t *                      param_value_size_ret);
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_GL_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
+/* OpenGL dependencies.                                                         */
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifdef __APPLE__
+    #include <OpenCL/cl_gl.h>
+#else
+    #include <CL/cl_gl.h>
+#endif
+/*
+ * For each extension, follow this template
+ *  cl_VEN_extname extension  */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+/*
+ * cl_khr_gl_event extension.
+ * See section 9.9 in the OpenCL 1.1 spec for more information.
+ *
+ * Declares one token and one entry point, clCreateEventFromGLsyncKHR, which
+ * takes a context and a cl_GLsync and returns a cl_event.
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(
+    cl_context context,
+    cl_GLsync  sync,
+    cl_int *   errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
+#ifdef __cplusplus
+}
+#endif
+#endif	/* __OPENCL_CL_GL_EXT_H  */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_platform.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h
+/**********************************************************************************
+ * Copyright (c) 2008-2018 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+#ifdef __APPLE__
+    #include <OpenCL/cl_version.h>
+    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+    #include <AvailabilityMacros.h>
+#else
+    #include <CL/cl_version.h>
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Decorations for API entry points and callbacks.
+ * CL_API_ENTRY is empty on every platform; CL_API_CALL and CL_CALLBACK expand
+ * to __stdcall only when _WIN32 is defined and are empty otherwise. */
+#define CL_API_ENTRY
+#ifndef _WIN32
+    #define CL_API_CALL
+    #define CL_CALLBACK
+#else
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+#endif
+/*
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
+#ifdef __APPLE__
+    /* Apple branch: each API version maps to an AVAILABLE_MAC_OS_X_VERSION_*
+     * macro (AvailabilityMacros.h is included at the top of this header), and
+     * the EXT suffixes additionally carry the weak_import attribute.
+     * NOTE(review): unlike the non-Apple branch, no *_SUFFIX__VERSION_2_x
+     * macros and no 1_0 / 1_2 deprecation PREFIX macros are defined here -
+     * confirm Apple builds never reference them. */
+    #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
+    #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_API_SUFFIX__VERSION_1_1                  AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define GCL_API_SUFFIX__VERSION_1_1                 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_1                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED       CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+    #else
+        #warning  This path should never happen outside of internal operating system development.  AvailabilityMacros do not function correctly here!
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        /* Define the (empty) PREFIX on this path as well: cl_gl.h writes it in
+         * front of the deprecated clCreateFromGLTexture2D/3D declarations, so
+         * leaving it undefined would break those declarations. */
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #endif
+#else
+    #define CL_EXTENSION_WEAK_LINK      /* non-Apple branch: the weak-link macro and every API/EXT version suffix below expand to nothing */
+    #define CL_API_SUFFIX__VERSION_1_0
+    #define CL_EXT_SUFFIX__VERSION_1_0
+    #define CL_API_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_1
+    #define CL_API_SUFFIX__VERSION_1_2
+    #define CL_EXT_SUFFIX__VERSION_1_2
+    #define CL_API_SUFFIX__VERSION_2_0
+    #define CL_EXT_SUFFIX__VERSION_2_0
+    #define CL_API_SUFFIX__VERSION_2_1
+    #define CL_EXT_SUFFIX__VERSION_2_1
+    #define CL_API_SUFFIX__VERSION_2_2
+    #define CL_EXT_SUFFIX__VERSION_2_2
+    /* Deprecation markers for the 1.0 .. 2.1 API generations.
+     * Defining CL_USE_DEPRECATED_OPENCL_<ver>_APIS before including this header
+     * makes both markers for that version expand to nothing. */
+    #if defined(__GNUC__)
+        /* GNU-compatible compilers: the attribute goes after the declaration,
+         * so every PREFIX is empty and only the SUFFIX depends on the opt-out. */
+        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+        #ifndef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated))
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_2_0_APIS
+            #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated))
+        #else
+            #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_2_1_APIS
+            #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED __attribute__((deprecated))
+        #else
+            #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
+        #endif
+    #elif defined(_WIN32)
+        /* Windows without __GNUC__: the declspec goes before the declaration,
+         * so every SUFFIX is empty and only the PREFIX depends on the opt-out. */
+        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
+        #ifndef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
+        #else
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
+        #else
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated)
+        #else
+            #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_2_0_APIS
+            #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated)
+        #else
+            #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+        #endif
+        #ifndef CL_USE_DEPRECATED_OPENCL_2_1_APIS
+            #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED __declspec(deprecated)
+        #else
+            #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+        #endif
+    #else
+        /* Any other compiler: no deprecation marker is emitted at all. */
+        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
+    #endif
+#endif
+#if (defined (_WIN32) && defined(_MSC_VER))
+/* scalar types
+ * This is the _WIN32 + _MSC_VER branch: the cl_* scalars are built from the
+ * compiler's sized __intN keywords rather than <stdint.h>, and no alignment
+ * attributes are applied.  cl_half is an unsigned 16-bit integer type. */
+typedef signed   __int8         cl_char;
+typedef unsigned __int8         cl_uchar;
+typedef signed   __int16        cl_short;
+typedef unsigned __int16        cl_ushort;
+typedef signed   __int32        cl_int;
+typedef unsigned __int32        cl_uint;
+typedef signed   __int64        cl_long;
+typedef unsigned __int64        cl_ulong;
+typedef unsigned __int16        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+/* Macro names and corresponding values defined by OpenCL
+ * (limits for the cl_* scalar types above, followed by math constants; the
+ * CL_M_*_F macros are float-suffixed versions of the CL_M_* constants). */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+#define CL_HALF_DIG          3
+#define CL_HALF_MANT_DIG     11
+#define CL_HALF_MAX_10_EXP   +4
+#define CL_HALF_MAX_EXP      +16
+#define CL_HALF_MIN_10_EXP   -4
+#define CL_HALF_MIN_EXP      -13
+#define CL_HALF_RADIX        2
+#define CL_HALF_MAX          65504.0f
+#define CL_HALF_MIN          6.103515625e-05f
+#define CL_HALF_EPSILON      9.765625e-04f
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          1.7976931348623158e+308
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+#define CL_M_E              2.7182818284590452354
+#define CL_M_LOG2E          1.4426950408889634074
+#define CL_M_LOG10E         0.43429448190325182765
+#define CL_M_LN2            0.69314718055994530942
+#define CL_M_LN10           2.30258509299404568402
+#define CL_M_PI             3.14159265358979323846
+#define CL_M_PI_2           1.57079632679489661923
+#define CL_M_PI_4           0.78539816339744830962
+#define CL_M_1_PI           0.31830988618379067154
+#define CL_M_2_PI           0.63661977236758134308
+#define CL_M_2_SQRTPI       1.12837916709551257390
+#define CL_M_SQRT2          1.41421356237309504880
+#define CL_M_SQRT1_2        0.70710678118654752440
+#define CL_M_E_F            2.718281828f
+#define CL_M_LOG2E_F        1.442695041f
+#define CL_M_LOG10E_F       0.434294482f
+#define CL_M_LN2_F          0.693147181f
+#define CL_M_LN10_F         2.302585093f
+#define CL_M_PI_F           3.141592654f
+#define CL_M_PI_2_F         1.570796327f
+#define CL_M_PI_4_F         0.785398163f
+#define CL_M_1_PI_F         0.318309886f
+#define CL_M_2_PI_F         0.636619772f
+#define CL_M_2_SQRTPI_F     1.128379167f
+#define CL_M_SQRT2_F        1.414213562f
+#define CL_M_SQRT1_2_F      0.707106781f
+#define CL_NAN              (CL_INFINITY - CL_INFINITY)  /* CL_INFINITY is defined a few lines below; that is fine because macros expand at the point of use */
+#define CL_HUGE_VALF        ((cl_float) 1e50)  /* 1e50 is larger than CL_FLT_MAX; presumably relies on the cast yielding +infinity - confirm for MSVC */
+#define CL_HUGE_VAL         ((cl_double) 1e500)  /* 1e500 is larger than CL_DBL_MAX; same assumption as CL_HUGE_VALF */
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+#else
+#include <stdint.h>
+/* scalar types
+ * Non-MSVC branch: the cl_* scalars are the <stdint.h> fixed-width types.
+ * Every type except cl_char / cl_uchar carries an explicit
+ * __attribute__((aligned(N))) with N = 2, 4 or 8.  cl_half is a uint16_t. */
+typedef int8_t          cl_char;
+typedef uint8_t         cl_uchar;
+typedef int16_t         cl_short    __attribute__((aligned(2)));
+typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
+typedef int32_t         cl_int      __attribute__((aligned(4)));
+typedef uint32_t        cl_uint     __attribute__((aligned(4)));
+typedef int64_t         cl_long     __attribute__((aligned(8)));
+typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
+typedef uint16_t        cl_half     __attribute__((aligned(2)));
+typedef float           cl_float    __attribute__((aligned(4)));
+typedef double          cl_double   __attribute__((aligned(8)));
+/* Macro names and corresponding values defined by OpenCL
+ * (limits for the cl_* scalar types above, followed by math constants; the
+ * CL_M_*_F macros are float-suffixed versions of the CL_M_* constants).
+ * On this branch CL_DBL_MAX is written out as a full decimal integer literal. */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+#define CL_HALF_DIG          3
+#define CL_HALF_MANT_DIG     11
+#define CL_HALF_MAX_10_EXP   +4
+#define CL_HALF_MAX_EXP      +16
+#define CL_HALF_MIN_10_EXP   -4
+#define CL_HALF_MIN_EXP      -13
+#define CL_HALF_RADIX        2
+#define CL_HALF_MAX          65504.0f
+#define CL_HALF_MIN          6.103515625e-05f
+#define CL_HALF_EPSILON      9.765625e-04f
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+#define CL_M_E              2.7182818284590452354
+#define CL_M_LOG2E          1.4426950408889634074
+#define CL_M_LOG10E         0.43429448190325182765
+#define CL_M_LN2            0.69314718055994530942
+#define CL_M_LN10           2.30258509299404568402
+#define CL_M_PI             3.14159265358979323846
+#define CL_M_PI_2           1.57079632679489661923
+#define CL_M_PI_4           0.78539816339744830962
+#define CL_M_1_PI           0.31830988618379067154
+#define CL_M_2_PI           0.63661977236758134308
+#define CL_M_2_SQRTPI       1.12837916709551257390
+#define CL_M_SQRT2          1.41421356237309504880
+#define CL_M_SQRT1_2        0.70710678118654752440
+#define CL_M_E_F            2.718281828f
+#define CL_M_LOG2E_F        1.442695041f
+#define CL_M_LOG10E_F       0.434294482f
+#define CL_M_LN2_F          0.693147181f
+#define CL_M_LN10_F         2.302585093f
+#define CL_M_PI_F           3.141592654f
+#define CL_M_PI_2_F         1.570796327f
+#define CL_M_PI_4_F         0.785398163f
+#define CL_M_1_PI_F         0.318309886f
+#define CL_M_2_PI_F         0.636619772f
+#define CL_M_2_SQRTPI_F     1.128379167f
+#define CL_M_SQRT2_F        1.414213562f
+#define CL_M_SQRT1_2_F      0.707106781f
+#if defined( __GNUC__ )   /* GNU-compatible compilers: take infinity and NaN from compiler builtins */
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+#else   /* other compilers: out-of-range literals plus a hand-declared nanf() */
+   #define CL_HUGE_VALF     ((cl_float) 1e50)  /* 1e50 is larger than CL_FLT_MAX; presumably relies on the cast yielding +infinity - confirm */
+   #define CL_HUGE_VAL      ((cl_double) 1e500)  /* 1e500 is larger than CL_DBL_MAX; same assumption */
+   float nanf( const char * );  /* declared here by hand; no math header is included in the visible part of this file */
+   #define CL_NAN           nanf( "" )
+#endif
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+#endif
+#include <stddef.h>
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which
+ * headers to load based on whether we are using GL or GLES here: the cl_GL*
+ * names are plain C integer types, so no GL header has to be included. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+/*
+ * Vector types
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned.
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte
+ *          alignment of the float).  The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them.  For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned.
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
+/* Define basic vector types.
+ * AltiVec (__VEC__ defined): map the __cl_* helper types onto the compiler's
+ * `vector` types and define the matching __CL_<TYPE>__ macro to 1 for each
+ * helper type that was provided. */
+#if defined( __VEC__ )
+   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+   typedef vector unsigned char     __cl_uchar16;
+   typedef vector signed char       __cl_char16;
+   typedef vector unsigned short    __cl_ushort8;
+   typedef vector signed short      __cl_short8;
+   typedef vector unsigned int      __cl_uint4;
+   typedef vector signed int        __cl_int4;
+   typedef vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#endif
+/* SSE (__SSE__ defined): provide __cl_float4.  With a GNU-compatible compiler
+ * it is a vector_size(16) float; otherwise it is the intrinsic type __m128.
+ * MinGW-w64 pulls the intrinsics from <intrin.h> instead of <xmmintrin.h>. */
+#ifdef __SSE__
+    #ifndef __MINGW64__
+        #include <xmmintrin.h>
+    #else
+        #include <intrin.h>
+    #endif
+    #ifndef __GNUC__
+        typedef __m128 __cl_float4;
+    #else
+        typedef float __cl_float4 __attribute__((vector_size(16)));
+    #endif
+    #define __CL_FLOAT4__ 1
+#endif
+/* SSE2: selected when the compiler defines __SSE2__. Provides the 16-byte
+ * native integer and double vector types and the matching __CL_*__ macros. */
+#if defined( __SSE2__ )
+    /* MinGW-w64 builds take the intrinsics from <intrin.h>; everything else
+     * uses <emmintrin.h>. */
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <emmintrin.h>
+    #endif
+    /* GNU-compatible compilers get element-typed vectors via vector_size(16);
+     * other compilers map every integer vector to __m128i and the double
+     * vector to __m128d. */
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
+        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
+        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+    #else
+        typedef __m128i __cl_uchar16;
+        typedef __m128i __cl_char16;
+        typedef __m128i __cl_ushort8;
+        typedef __m128i __cl_short8;
+        typedef __m128i __cl_uint4;
+        typedef __m128i __cl_int4;
+        typedef __m128i __cl_ulong2;
+        typedef __m128i __cl_long2;
+        typedef __m128d __cl_double2;
+    #endif
+    /* Advertise the types defined above to the cl_* unions below. */
+    #define __CL_UCHAR16__  1
+    #define __CL_CHAR16__   1
+    #define __CL_USHORT8__  1
+    #define __CL_SHORT8__   1
+    #define __CL_INT4__     1
+    #define __CL_UINT4__    1
+    #define __CL_ULONG2__   1
+    #define __CL_LONG2__    1
+    #define __CL_DOUBLE2__  1
+#endif
+/* MMX: selected when the compiler defines __MMX__. Provides the 8-byte native
+ * vector types and the matching __CL_*__ macros. */
+#if defined( __MMX__ )
+    #include <mmintrin.h>
+    /* GNU-compatible compilers get element-typed vectors via vector_size(8);
+     * other compilers map every type to __m64. */
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
+        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
+        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
+        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
+        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
+        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
+        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
+        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
+        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
+    #else
+        typedef __m64       __cl_uchar8;
+        typedef __m64       __cl_char8;
+        typedef __m64       __cl_ushort4;
+        typedef __m64       __cl_short4;
+        typedef __m64       __cl_uint2;
+        typedef __m64       __cl_int2;
+        typedef __m64       __cl_ulong1;
+        typedef __m64       __cl_long1;
+        typedef __m64       __cl_float2;
+    #endif
+    /* Advertise the types defined above to the cl_* unions below. */
+    #define __CL_UCHAR8__   1
+    #define __CL_CHAR8__    1
+    #define __CL_USHORT4__  1
+    #define __CL_SHORT4__   1
+    #define __CL_INT2__     1
+    #define __CL_UINT2__    1
+    #define __CL_ULONG1__   1
+    #define __CL_LONG1__    1
+    #define __CL_FLOAT2__   1
+#endif
+/* AVX: selected when the compiler defines __AVX__. Provides the 32-byte
+ * native float and double vector types and the matching __CL_*__ macros. */
+#if defined( __AVX__ )
+    /* MinGW-w64 builds take the intrinsics from <intrin.h>; everything else
+     * uses <immintrin.h>. */
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <immintrin.h>
+    #endif
+    /* GNU-compatible compilers use vector_size(32); other compilers use the
+     * intrinsic types __m256 / __m256d. */
+    #if defined( __GNUC__ )
+        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
+        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
+    #else
+        typedef __m256      __cl_float8;
+        typedef __m256d     __cl_double4;
+    #endif
+    #define __CL_FLOAT8__   1
+    #define __CL_DOUBLE4__  1
+#endif
+/* Define capabilities for anonymous struct members.
+ *
+ * __CL_HAS_ANON_STRUCT__ is 1 when the cl_* unions below may contain
+ * anonymous struct members (giving .x/.y, .s0/.s1, .lo/.hi accessors) and 0
+ * otherwise. __CL_ANON_STRUCT__ is the prefix placed before each anonymous
+ * struct; it expands to __extension__ for GNU-compatible compilers and to
+ * nothing elsewhere. Every branch defines both macros so that the later
+ * "#if __CL_HAS_ANON_STRUCT__" tests never evaluate an undefined identifier.
+ * Both macros are #undef'd at the end of this header. */
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && defined(_MSC_VER)
+    #if _MSC_VER >= 1500
+   /* Microsoft Developer Studio 2008 supports anonymous structs, but
+    * complains by default. */
+    #define  __CL_HAS_ANON_STRUCT__ 1
+    #define  __CL_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless
+    * struct/union. The matching "#pragma warning( pop )" is at the end of
+    * this header under the same _MSC_VER condition. */
+    #pragma warning( push )
+    #pragma warning( disable : 4201 )
+    #else
+   /* Older Microsoft compilers: no anonymous struct members. Define the
+    * macros explicitly instead of leaving them undefined. */
+    #define  __CL_HAS_ANON_STRUCT__ 0
+    #define  __CL_ANON_STRUCT__
+    #endif
+#else
+#define  __CL_HAS_ANON_STRUCT__ 0
+#define  __CL_ANON_STRUCT__
+#endif
+/* Define alignment keys.
+ *
+ * CL_ALIGNED(_x) is attached to the first member of every cl_* vector union
+ * below to request _x-byte alignment. It expands to the GNU aligned attribute
+ * where available and to nothing elsewhere, in which case the unions only
+ * have the natural alignment of their members. */
+#if defined( __GNUC__ )
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && defined(_MSC_VER)
+    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
+    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
+    /* #include <crtdefs.h>                                                                                             */
+    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
+    #define CL_ALIGNED(_x)
+#else
+   #warning  Need to implement some method to align data here
+   #define  CL_ALIGNED(_x)
+#endif
+/* Feature-test macros for client code: they report which named accessors the
+ * cl_* vector unions provide. Neither macro is defined when anonymous struct
+ * members are unavailable. */
+#if __CL_HAS_ANON_STRUCT__
+    /* Half-vector accessors .lo and .hi are available */
+    #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+    /* Component accessors .xyzw and .s0123...{f|F} are available */
+    #define CL_HAS_NAMED_VECTOR_FIELDS 1
+#endif
+/* Define cl_vector types */
+/* ---- cl_charn ---- */
+/* Vectors of cl_char. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_CHAR<N>__ macro is defined. */
+/* cl_char2: 2 elements, CL_ALIGNED(2). */
+typedef union
+{
+    cl_char  CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2;
+#endif
+}cl_char2;
+/* cl_char4: 4 elements, CL_ALIGNED(4); .lo/.hi are cl_char2 halves. */
+typedef union
+{
+    cl_char  CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+    __cl_char4     v4;
+#endif
+}cl_char4;
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef  cl_char4  cl_char3;
+/* cl_char8: 8 elements, CL_ALIGNED(8); .x/.y/.z/.w name only the first four
+ * elements, .lo/.hi are cl_char4 halves. */
+typedef union
+{
+    cl_char   CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2[4];
+#endif
+#if defined( __CL_CHAR4__)
+    __cl_char4     v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8;
+#endif
+}cl_char8;
+/* cl_char16: 16 elements, CL_ALIGNED(16). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_char8 halves. */
+typedef union
+{
+    cl_char  CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2[8];
+#endif
+#if defined( __CL_CHAR4__)
+    __cl_char4     v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+    __cl_char16    v16;
+#endif
+}cl_char16;
+/* ---- cl_ucharn ---- */
+/* cl_uchar2: vector of 2 cl_uchar elements, CL_ALIGNED(2).
+ * s[] gives array access; when __CL_HAS_ANON_STRUCT__ is non-zero the same
+ * storage is also reachable as .x/.y, .s0/.s1 and .lo/.hi. The native .v2
+ * member is present only if __CL_UCHAR2__ is defined, which is the same
+ * upper-case macro that cl_uchar4, cl_uchar8 and cl_uchar16 test for their
+ * v2[] members. */
+typedef union
+{
+    cl_uchar  CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2;
+#endif
+}cl_uchar2;
+/* cl_uchar4: 4 cl_uchar elements, CL_ALIGNED(4); .lo/.hi are cl_uchar2
+ * halves. s[] is always available; the named views require
+ * __CL_HAS_ANON_STRUCT__ and each .v<N> member requires __CL_UCHAR<N>__. */
+typedef union
+{
+    cl_uchar  CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2[2];
+#endif
+#if defined( __CL_UCHAR4__)
+    __cl_uchar4     v4;
+#endif
+}cl_uchar4;
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef  cl_uchar4  cl_uchar3;
+/* cl_uchar8: 8 elements, CL_ALIGNED(8); .x/.y/.z/.w name only the first four
+ * elements, .lo/.hi are cl_uchar4 halves. */
+typedef union
+{
+    cl_uchar   CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2[4];
+#endif
+#if defined( __CL_UCHAR4__)
+    __cl_uchar4     v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8;
+#endif
+}cl_uchar8;
+/* cl_uchar16: 16 elements, CL_ALIGNED(16). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_uchar8 halves. */
+typedef union
+{
+    cl_uchar  CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2[8];
+#endif
+#if defined( __CL_UCHAR4__)
+    __cl_uchar4     v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+    __cl_uchar16    v16;
+#endif
+}cl_uchar16;
+/* ---- cl_shortn ---- */
+/* Vectors of cl_short. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_SHORT<N>__ macro is defined. */
+/* cl_short2: 2 elements, CL_ALIGNED(4). */
+typedef union
+{
+    cl_short  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2;
+#endif
+}cl_short2;
+/* cl_short4: 4 elements, CL_ALIGNED(8); .lo/.hi are cl_short2 halves. */
+typedef union
+{
+    cl_short  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2[2];
+#endif
+#if defined( __CL_SHORT4__)
+    __cl_short4     v4;
+#endif
+}cl_short4;
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef  cl_short4  cl_short3;
+/* cl_short8: 8 elements, CL_ALIGNED(16); .x/.y/.z/.w name only the first
+ * four elements, .lo/.hi are cl_short4 halves. */
+typedef union
+{
+    cl_short   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2[4];
+#endif
+#if defined( __CL_SHORT4__)
+    __cl_short4     v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8;
+#endif
+}cl_short8;
+/* cl_short16: 16 elements, CL_ALIGNED(32). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_short8 halves. */
+typedef union
+{
+    cl_short  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2[8];
+#endif
+#if defined( __CL_SHORT4__)
+    __cl_short4     v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+    __cl_short16    v16;
+#endif
+}cl_short16;
+/* ---- cl_ushortn ---- */
+/* Vectors of cl_ushort. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_USHORT<N>__ macro is defined. */
+/* cl_ushort2: 2 elements, CL_ALIGNED(4). */
+typedef union
+{
+    cl_ushort  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2;
+#endif
+}cl_ushort2;
+/* cl_ushort4: 4 elements, CL_ALIGNED(8); .lo/.hi are cl_ushort2 halves. */
+typedef union
+{
+    cl_ushort  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2[2];
+#endif
+#if defined( __CL_USHORT4__)
+    __cl_ushort4     v4;
+#endif
+}cl_ushort4;
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef  cl_ushort4  cl_ushort3;
+/* cl_ushort8: 8 elements, CL_ALIGNED(16); .x/.y/.z/.w name only the first
+ * four elements, .lo/.hi are cl_ushort4 halves. */
+typedef union
+{
+    cl_ushort   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2[4];
+#endif
+#if defined( __CL_USHORT4__)
+    __cl_ushort4     v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8;
+#endif
+}cl_ushort8;
+/* cl_ushort16: 16 elements, CL_ALIGNED(32). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_ushort8 halves. */
+typedef union
+{
+    cl_ushort  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2[8];
+#endif
+#if defined( __CL_USHORT4__)
+    __cl_ushort4     v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+    __cl_ushort16    v16;
+#endif
+}cl_ushort16;
+/* ---- cl_halfn ---- */
+/* Vectors of cl_half. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_HALF<N>__ macro is defined. */
+/* cl_half2: 2 elements, CL_ALIGNED(4). */
+typedef union
+{
+    cl_half  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1; };
+    __CL_ANON_STRUCT__ struct{ cl_half  lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2;
+#endif
+}cl_half2;
+/* cl_half4: 4 elements, CL_ALIGNED(8); .lo/.hi are cl_half2 halves. */
+typedef union
+{
+    cl_half  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3; };
+    __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[2];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4;
+#endif
+}cl_half4;
+/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
+typedef  cl_half4  cl_half3;
+/* cl_half8: 8 elements, CL_ALIGNED(16); .x/.y/.z/.w name only the first four
+ * elements, .lo/.hi are cl_half4 halves. */
+typedef union
+{
+    cl_half   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7; };
+    __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[4];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4[2];
+#endif
+#if defined( __CL_HALF8__ )
+    __cl_half8     v8;
+#endif
+}cl_half8;
+/* cl_half16: 16 elements, CL_ALIGNED(32). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_half8 halves. */
+typedef union
+{
+    cl_half  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+    __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[8];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4[4];
+#endif
+#if defined( __CL_HALF8__ )
+    __cl_half8     v8[2];
+#endif
+#if defined( __CL_HALF16__ )
+    __cl_half16    v16;
+#endif
+}cl_half16;
+/* ---- cl_intn ---- */
+/* Vectors of cl_int. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_INT<N>__ macro is defined. */
+/* cl_int2: 2 elements, CL_ALIGNED(8). */
+typedef union
+{
+    cl_int  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2;
+#endif
+}cl_int2;
+/* cl_int4: 4 elements, CL_ALIGNED(16); .lo/.hi are cl_int2 halves. */
+typedef union
+{
+    cl_int  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2[2];
+#endif
+#if defined( __CL_INT4__)
+    __cl_int4     v4;
+#endif
+}cl_int4;
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef  cl_int4  cl_int3;
+/* cl_int8: 8 elements, CL_ALIGNED(32); .x/.y/.z/.w name only the first four
+ * elements, .lo/.hi are cl_int4 halves. */
+typedef union
+{
+    cl_int   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2[4];
+#endif
+#if defined( __CL_INT4__)
+    __cl_int4     v4[2];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8;
+#endif
+}cl_int8;
+/* cl_int16: 16 elements, CL_ALIGNED(64). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_int8 halves. */
+typedef union
+{
+    cl_int  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2[8];
+#endif
+#if defined( __CL_INT4__)
+    __cl_int4     v4[4];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8[2];
+#endif
+#if defined( __CL_INT16__ )
+    __cl_int16    v16;
+#endif
+}cl_int16;
+/* ---- cl_uintn ---- */
+/* Vectors of cl_uint. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_UINT<N>__ macro is defined. */
+/* cl_uint2: 2 elements, CL_ALIGNED(8). */
+typedef union
+{
+    cl_uint  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2;
+#endif
+}cl_uint2;
+/* cl_uint4: 4 elements, CL_ALIGNED(16); .lo/.hi are cl_uint2 halves. */
+typedef union
+{
+    cl_uint  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2[2];
+#endif
+#if defined( __CL_UINT4__)
+    __cl_uint4     v4;
+#endif
+}cl_uint4;
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef  cl_uint4  cl_uint3;
+/* cl_uint8: 8 elements, CL_ALIGNED(32); .x/.y/.z/.w name only the first four
+ * elements, .lo/.hi are cl_uint4 halves. */
+typedef union
+{
+    cl_uint   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2[4];
+#endif
+#if defined( __CL_UINT4__)
+    __cl_uint4     v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8;
+#endif
+}cl_uint8;
+/* cl_uint16: 16 elements, CL_ALIGNED(64). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_uint8 halves. */
+typedef union
+{
+    cl_uint  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2[8];
+#endif
+#if defined( __CL_UINT4__)
+    __cl_uint4     v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+    __cl_uint16    v16;
+#endif
+}cl_uint16;
+/* ---- cl_longn ---- */
+/* Vectors of cl_long. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_LONG<N>__ macro is defined. */
+/* cl_long2: 2 elements, CL_ALIGNED(16). */
+typedef union
+{
+    cl_long  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2;
+#endif
+}cl_long2;
+/* cl_long4: 4 elements, CL_ALIGNED(32); .lo/.hi are cl_long2 halves. */
+typedef union
+{
+    cl_long  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2[2];
+#endif
+#if defined( __CL_LONG4__)
+    __cl_long4     v4;
+#endif
+}cl_long4;
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef  cl_long4  cl_long3;
+/* cl_long8: 8 elements, CL_ALIGNED(64); .x/.y/.z/.w name only the first four
+ * elements, .lo/.hi are cl_long4 halves. */
+typedef union
+{
+    cl_long   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2[4];
+#endif
+#if defined( __CL_LONG4__)
+    __cl_long4     v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8;
+#endif
+}cl_long8;
+/* cl_long16: 16 elements, CL_ALIGNED(128). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_long8 halves. */
+typedef union
+{
+    cl_long  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2[8];
+#endif
+#if defined( __CL_LONG4__)
+    __cl_long4     v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+    __cl_long16    v16;
+#endif
+}cl_long16;
+/* ---- cl_ulongn ---- */
+/* Vectors of cl_ulong. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_ULONG<N>__ macro is defined. */
+/* cl_ulong2: 2 elements, CL_ALIGNED(16). */
+typedef union
+{
+    cl_ulong  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2;
+#endif
+}cl_ulong2;
+/* cl_ulong4: 4 elements, CL_ALIGNED(32); .lo/.hi are cl_ulong2 halves. */
+typedef union
+{
+    cl_ulong  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2[2];
+#endif
+#if defined( __CL_ULONG4__)
+    __cl_ulong4     v4;
+#endif
+}cl_ulong4;
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef  cl_ulong4  cl_ulong3;
+/* cl_ulong8: 8 elements, CL_ALIGNED(64); .x/.y/.z/.w name only the first
+ * four elements, .lo/.hi are cl_ulong4 halves. */
+typedef union
+{
+    cl_ulong   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2[4];
+#endif
+#if defined( __CL_ULONG4__)
+    __cl_ulong4     v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8;
+#endif
+}cl_ulong8;
+/* cl_ulong16: 16 elements, CL_ALIGNED(128). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_ulong8 halves. */
+typedef union
+{
+    cl_ulong  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2[8];
+#endif
+#if defined( __CL_ULONG4__)
+    __cl_ulong4     v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+    __cl_ulong16    v16;
+#endif
+}cl_ulong16;
+/* --- cl_floatn ---- */
+/* Vectors of cl_float. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_FLOAT<N>__ macro is defined. */
+/* cl_float2: 2 elements, CL_ALIGNED(8). */
+typedef union
+{
+    cl_float  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2;
+#endif
+}cl_float2;
+/* cl_float4: 4 elements, CL_ALIGNED(16); .lo/.hi are cl_float2 halves. */
+typedef union
+{
+    cl_float  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2[2];
+#endif
+#if defined( __CL_FLOAT4__)
+    __cl_float4     v4;
+#endif
+}cl_float4;
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef  cl_float4  cl_float3;
+/* cl_float8: 8 elements, CL_ALIGNED(32); .x/.y/.z/.w name only the first
+ * four elements, .lo/.hi are cl_float4 halves. */
+typedef union
+{
+    cl_float   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2[4];
+#endif
+#if defined( __CL_FLOAT4__)
+    __cl_float4     v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8;
+#endif
+}cl_float8;
+/* cl_float16: 16 elements, CL_ALIGNED(64). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_float8 halves. */
+typedef union
+{
+    cl_float  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2[8];
+#endif
+#if defined( __CL_FLOAT4__)
+    __cl_float4     v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+    __cl_float16    v16;
+#endif
+}cl_float16;
+/* --- cl_doublen ---- */
+/* Vectors of cl_double. Every union exposes its elements as the array s[].
+ * When __CL_HAS_ANON_STRUCT__ is non-zero the same storage is also reachable
+ * through named views (.x/.y/.z/.w, .s0..., .lo/.hi). A .v<N> member is
+ * present only if the matching __CL_DOUBLE<N>__ macro is defined. */
+/* cl_double2: 2 elements, CL_ALIGNED(16). */
+typedef union
+{
+    cl_double  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2;
+#endif
+}cl_double2;
+/* cl_double4: 4 elements, CL_ALIGNED(32); .lo/.hi are cl_double2 halves. */
+typedef union
+{
+    cl_double  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2[2];
+#endif
+#if defined( __CL_DOUBLE4__)
+    __cl_double4     v4;
+#endif
+}cl_double4;
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef  cl_double4  cl_double3;
+/* cl_double8: 8 elements, CL_ALIGNED(64); .x/.y/.z/.w name only the first
+ * four elements, .lo/.hi are cl_double4 halves. */
+typedef union
+{
+    cl_double   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2[4];
+#endif
+#if defined( __CL_DOUBLE4__)
+    __cl_double4     v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8;
+#endif
+}cl_double8;
+/* cl_double16: 16 elements, CL_ALIGNED(128). In the first view the __spacerN
+ * members occupy positions 4..9 so that lower-case sa..sf land on the same
+ * positions as sA..sF in the second view. .lo/.hi are cl_double8 halves. */
+typedef union
+{
+    cl_double  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2[8];
+#endif
+#if defined( __CL_DOUBLE4__)
+    __cl_double4     v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+    __cl_double16    v16;
+#endif
+}cl_double16;
+/* Macro to facilitate debugging
+ * Usage:
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   Each line thereafter of OpenCL C source must end with: \n\
+ *   The last line ends in ";
+ *
+ *   Example:
+ *
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   kernel void foo( int a, global float * b )      \n\
+ *   {                                               \n\
+ *      // my comment                                \n\
+ *      b[ get_global_id(0)] = a;                    \n\
+ *   }                                               \n\
+ *   ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+/* __CL_STRINGIFY turns its argument into a string literal without expanding
+ * it; _CL_STRINGIFY adds one level of indirection so that a macro argument
+ * such as __LINE__ is expanded before being stringified. */
+#define  __CL_STRINGIFY( _x )               # _x
+#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
+/* Expands to adjacent string literals forming: #line <line> "<file>" followed
+ * by two newlines, using the __LINE__ and __FILE__ of the expansion site. */
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+/* Closes a brace opened for C++ builds earlier in this header (the opening
+ * is outside this excerpt; presumably an extern "C" block). */
+#ifdef __cplusplus
+}
+#endif
+/* The anonymous-struct helper macros are internal to this header; remove
+ * them so they do not leak into including code. */
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+/* Restore the MSVC warning state saved by the "#pragma warning( push )" that
+ * was issued under the same _MSC_VER >= 1500 condition when C4201 was
+ * disabled. */
+#if defined( _WIN32) && defined(_MSC_VER)
+    #if _MSC_VER >=1500
+    #pragma warning( pop )
+    #endif
+#endif
+#endif  /* __CL_PLATFORM_H  */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h
+/**********************************************************************************
+ * Copyright (c) 2008-2016 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/*****************************************************************************\
+Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+File Name: cl_va_api_media_sharing_intel.h
+Abstract:
+Notes:
+\*****************************************************************************/
+#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
+#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#include <va/va.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+/******************************************
+* cl_intel_va_api_media_sharing extension *
+*******************************************/
+/* Defined to 1 so that code can test for this extension's declarations with
+ * the preprocessor. */
+#define cl_intel_va_api_media_sharing 1
+/* error codes */
+#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL               -1098
+#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL               -1099
+#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL      -1100
+#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL          -1101
+/* cl_va_api_device_source_intel */
+#define CL_VA_API_DISPLAY_INTEL                             0x4094
+/* cl_va_api_device_set_intel */
+#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL               0x4095
+#define CL_ALL_DEVICES_FOR_VA_API_INTEL                     0x4096
+/* cl_context_info */
+#define CL_CONTEXT_VA_API_DISPLAY_INTEL                     0x4097
+/* cl_mem_info */
+#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL                   0x4098
+/* cl_image_info */
+#define CL_IMAGE_VA_API_PLANE_INTEL                         0x4099
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL      0x409A
+#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL      0x409B
+/* Enumeration types for the two groups of values above; both are plain
+ * cl_uint. */
+typedef cl_uint cl_va_api_device_source_intel;
+typedef cl_uint cl_va_api_device_set_intel;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
+    cl_platform_id                /* platform */,
+    cl_va_api_device_source_intel /* media_adapter_type */,
+    void*                         /* media_adapter */,
+    cl_va_api_device_set_intel    /* media_adapter_set */,
+    cl_uint                       /* num_entries */,
+    cl_device_id*                 /* devices */,
+    cl_uint*                      /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
+    cl_platform_id                /* platform */,
+    cl_va_api_device_source_intel /* media_adapter_type */,
+    void*                         /* media_adapter */,
+    cl_va_api_device_set_intel    /* media_adapter_set */,
+    cl_uint                       /* num_entries */,
+    cl_device_id*                 /* devices */,
+    cl_uint*                      /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromVA_APIMediaSurfaceINTEL(
+    cl_context                    /* context */,
+    cl_mem_flags                  /* flags */,
+    VASurfaceID*                  /* surface */,
+    cl_uint                       /* plane */,
+    cl_int*                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
+    cl_context                    /* context */,
+    cl_mem_flags                  /* flags */,
+    VASurfaceID*                  /* surface */,
+    cl_uint                       /* plane */,
+    cl_int*                       /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireVA_APIMediaSurfacesINTEL(
+    cl_command_queue              /* command_queue */,
+    cl_uint                       /* num_objects */,
+    const cl_mem*                 /* mem_objects */,
+    cl_uint                       /* num_events_in_wait_list */,
+    const cl_event*               /* event_wait_list */,
+    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
+    cl_command_queue              /* command_queue */,
+    cl_uint                       /* num_objects */,
+    const cl_mem*                 /* mem_objects */,
+    cl_uint                       /* num_events_in_wait_list */,
+    const cl_event*               /* event_wait_list */,
+    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseVA_APIMediaSurfacesINTEL(
+    cl_command_queue              /* command_queue */,
+    cl_uint                       /* num_objects */,
+    const cl_mem*                 /* mem_objects */,
+    cl_uint                       /* num_events_in_wait_list */,
+    const cl_event*               /* event_wait_list */,
+    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
+    cl_command_queue              /* command_queue */,
+    cl_uint                       /* num_objects */,
+    const cl_mem*                 /* mem_objects */,
+    cl_uint                       /* num_events_in_wait_list */,
+    const cl_event*               /* event_wait_list */,
+    cl_event*                     /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+#ifdef __cplusplus
+}
+#endif
+#endif  /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
--- a/third_party/opencl/OpenCL-Headers/CL/cl_version.h
+++ b/third_party/opencl/OpenCL-Headers/CL/cl_version.h
+/*******************************************************************************
+ * Copyright (c) 2018 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+#ifndef __CL_VERSION_H /* Derives the CL_VERSION_x_y and CL_USE_DEPRECATED_OPENCL_x_y_APIS macros from CL_TARGET_OPENCL_VERSION */
+#define __CL_VERSION_H
+/* Detect which version to target: when the includer has not defined CL_TARGET_OPENCL_VERSION, report that at compile time and default it to 220 */
+#if !defined(CL_TARGET_OPENCL_VERSION)
+#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
+#define CL_TARGET_OPENCL_VERSION 220
+#endif /* !defined(CL_TARGET_OPENCL_VERSION) */
+#if CL_TARGET_OPENCL_VERSION != 100 && \
+    CL_TARGET_OPENCL_VERSION != 110 && \
+    CL_TARGET_OPENCL_VERSION != 120 && \
+    CL_TARGET_OPENCL_VERSION != 200 && \
+    CL_TARGET_OPENCL_VERSION != 210 && \
+    CL_TARGET_OPENCL_VERSION != 220 /* any value outside the six listed ones is reported and replaced by 220 */
+#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220). Defaulting to 220 (OpenCL 2.2)")
+#undef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 220
+#endif /* invalid CL_TARGET_OPENCL_VERSION */
+/* OpenCL Version: define CL_VERSION_x_y as 1 for every version up to and including the target, unless it is already defined */
+#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
+#define CL_VERSION_2_2  1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
+#define CL_VERSION_2_1  1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
+#define CL_VERSION_2_0  1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
+#define CL_VERSION_1_2  1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
+#define CL_VERSION_1_1  1
+#endif
+#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
+#define CL_VERSION_1_0  1
+#endif
+/* Allow deprecated APIs for older OpenCL versions: define CL_USE_DEPRECATED_OPENCL_x_y_APIS for every version from the target up through 2.1, unless it is already defined; a target of 220 defines none of them. */
+#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
+#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
+#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#endif
+#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
+#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#endif
+#endif  /* __CL_VERSION_H */
--- a/third_party/opencl/OpenCL-Headers/CL/opencl.h
+++ b/third_party/opencl/OpenCL-Headers/CL/opencl.h
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_H /* Umbrella header: a single include that pulls in cl.h, cl_gl.h, cl_gl_ext.h and cl_ext.h */
+#define __OPENCL_H
+#ifdef __cplusplus /* wrap the includes below in C linkage when compiled as C++ */
+extern "C" {
+#endif /* __cplusplus */
+#ifdef __APPLE__ /* when __APPLE__ is defined the headers are taken from the OpenCL/ include prefix */
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+#else /* otherwise the same four headers are taken from the CL/ include prefix */
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+#endif /* __APPLE__ */
+#ifdef __cplusplus /* close the extern "C" block opened above */
+}
+#endif /* __cplusplus */
+#endif  /* __OPENCL_H   */
--- a/third_party/opencl/OpenCL-Headers/LICENSE
+++ b/third_party/opencl/OpenCL-Headers/LICENSE
+Copyright (c) 2008-2015 The Khronos Group Inc.
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and/or associated documentation files (the
+"Materials"), to deal in the Materials without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Materials, and to
+permit persons to whom the Materials are furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Materials.
+MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+   https://www.khronos.org/registry/
+THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
--- a/third_party/opencl/OpenCL-Headers/README.md
+++ b/third_party/opencl/OpenCL-Headers/README.md
+# OpenCL<sup>TM</sup> API Headers
+This repository contains C language headers for the OpenCL API.
+The authoritative public repository for these headers is located at:
+https://github.com/KhronosGroup/OpenCL-Headers
+Issues, proposed fixes for issues, and other suggested changes should be
+created using GitHub.
+## Branch Structure
+The OpenCL API headers in this repository are Unified headers and are designed
+to work with all released OpenCL versions.  This differs from previous OpenCL
+API headers, where version-specific API headers either existed in separate
+branches, or in separate folders in a branch.
+## Compiling for a Specific OpenCL Version
+By default, the OpenCL API headers in this repository are for the latest
+OpenCL version (currently OpenCL 2.2).  To use these API headers to target
+a different OpenCL version, an application may `#define` the preprocessor
+value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers.
+The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing
+the OpenCL API version.
+For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may
+include the OpenCL API headers as follows:
+```
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/opencl.h>
+```
+## Directory Structure
+```
+README.md               This file
+LICENSE                 Source license for the OpenCL API headers
+CL/                     Unified OpenCL API headers tree
+```
+## License
+See [LICENSE](LICENSE).
+---
+OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.
--- a/tools/android-debug-script/push2android.sh
+++ b/tools/android-debug-script/push2android.sh
@@ -5,12 +5,12 @@ MODELS_PATH="../../test/models/*"
 MODELS_SRC="../../test/models"
 IMAGE_PATH="../../test/images/*"
 EXE_FILE="../../test/build/*"
-EXE_DIR="data/local/tmp/bin"
+EXE_DIR="/data/local/tmp/bin"
 adb shell mkdir ${EXE_DIR}
-MODELS_DIR="data/local/tmp/models"
+MODELS_DIR="/data/local/tmp/models"
 adb shell mkdir ${MODELS_DIR}
 for file in `ls ${MODELS_SRC}`
-do 
+do
    adb shell mkdir ${MODELS_DIR}"/"${file}
 done
@@ -19,11 +19,15 @@ ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*"
 adb push ${ACL_BUILD_PATH} ${EXE_DIR}
 fi
-IMAGES_DIR="data/local/tmp/images"
+IMAGES_DIR="/data/local/tmp/images"
 adb shell mkdir ${IMAGES_DIR}
 LIB_PATH="../../build/release/arm-v7a/build/*"
 adb push ${EXE_FILE} ${EXE_DIR}
-adb push ${LIB_PATH} ${EXE_DIR}
+for file in ${LIB_PATH}  # left unquoted on purpose: the glob stored in LIB_PATH has to expand here
+do
+    adb push "${file}" "${EXE_DIR}"  # push one file per call; quoted so a path containing whitespace stays a single argument
+done
 if [[ $1 != "npm" ]]; then
 adb push ${IMAGE_PATH} ${IMAGES_DIR}
 adb push ${MODELS_PATH} ${MODELS_DIR}

--- a/tools/build.sh
+++ b/tools/build.sh
@@ -90,6 +90,8 @@ build_for_android() {
    fi
    cd "../build/release/${PLATFORM}"
    make -j 8
+    mkdir -p ./build/cl_kernel  # -p: do not fail when the directory is left over from a previous build
+    cp ../../../src/operators/kernel/cl/cl_kernel/*  ./build/cl_kernel/  # copy the cl_kernel files from the source tree into the build output directory
 }

--- a/tools/pre-commit.hooks/clang-format.hook
+++ b/tools/pre-commit.hooks/clang-format.hook
@@ -17,7 +17,7 @@ shift
 perl -i -pe 's|^\s+#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
 (
 # remove clang format ios_io folder
-flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
+flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||g')  # g flag: strip every ios_io path from the argument list, not only the first one
 clang-format -i $flist
 )
 perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"