diff --git a/CMakeLists.txt b/CMakeLists.txt index bdbf5a6ea604400fb5087976df0e1e9c279fd78d..c90d4ec1b7699c718a6036cd09f6a3c15a068070 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,16 @@ -cmake_minimum_required(VERSION 3.0) -project(paddle-mobile) - -# select the platform to build -option(CPU "armv7 with neon support" ON) -option(MALI_GPU "mali gpu support" OFF) -option(FPGA "fpga support" OFF) +cmake_minimum_required(VERSION 3.6) -option(USE_OPENMP "openmp support" OFF) +option(USE_OPENMP "openmp support" ON) option(DEBUGING "enable debug mode" ON) -option(USE_EXCEPTION "use std exception" OFF) +option(USE_EXCEPTION "use std exception" ON) option(LOG_PROFILE "log profile" OFF) +# select the platform to build +option(CPU "armv7 with neon" ON) +option(GPU_MALI "mali gpu" OFF) +option(GPU_CL "opencl gpu" ON) +option(FPGA "fpga" OFF) + +project(paddle-mobile) file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) @@ -70,7 +71,27 @@ else() endforeach() endif() -if(MALI_GPU) +if (GPU_CL) + add_definitions(-DPADDLE_MOBILE_CL) + + # opencl version + add_definitions(-DCL_TARGET_OPENCL_VERSION=220) + + link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so) + include_directories(third_party/opencl/OpenCL-Headers) +else() + file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + + file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() +endif() + +if (GPU_MALI) add_definitions(-DPADDLE_MOBILE_MALI_GPU) add_definitions(-DUSE_ACL=1) add_definitions(-DUSE_OPENCL) @@ -124,17 +145,17 @@ endif() if(ANDROID_NDK_TOOLCHAIN_INCLUDED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) - list(REMOVE_ITEM 
PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) endif() if(IS_IOS) else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) -endif() + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h) +endif () set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/src/common/common.h b/src/common/common.h index 12157b5e946490d041f0cc0d235142a13a3a2527..c7a681f426f788bcd8ee8f52dbfab3c6e1afeb8f 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include +#include // NOLINT + +namespace paddle_mobile { using Time = decltype(std::chrono::high_resolution_clock::now()); @@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) { ms counter = std::chrono::duration_cast(diff); return counter.count() / 1000.0; } + +} // namespace paddle_mobile diff --git a/src/common/enforce.h b/src/common/enforce.h index aebe2a58031cb1341596f07dbf653be4a5e01900..bf21b5b9a2fe5f70b3bd23a581f0c1dfbf373f42 100644 --- a/src/common/enforce.h +++ b/src/common/enforce.h @@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception { std::string detail(buffer); \ throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \ __FILE__, __LINE__); \ - } + } \ + exit(0); #define PADDLE_MOBILE_ENFORCE(stat, ...) \ { \ diff --git a/src/common/types.h b/src/common/types.h index 0855bd053f0dc804b6f3289796f3818657675864..70f6debf8756211cf49a62d074010e2bc6d4eaa7 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -39,7 +39,13 @@ struct PrecisionTrait { }; //! device type -enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; +enum DeviceTypeEnum { + kINVALID = -1, + kCPU = 0, + kFPGA = 1, + kGPU_MALI = 2, + kGPU_CL = 3 +}; template struct DeviceType {}; @@ -47,6 +53,7 @@ struct DeviceType {}; typedef DeviceType CPU; typedef DeviceType FPGA; typedef DeviceType GPU_MALI; +typedef DeviceType GPU_CL; //! 
data type enum DataType { diff --git a/src/framework/attribute.h b/src/framework/attribute.h index a94346bc7ab321b0f5710a98fb3cc60198f148b0..a21e0a4ec321dbfe08f87160cc2f0c159594920d 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -117,9 +117,9 @@ class Attribute { template static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) { - if (attr.variant_.TypeId() == typeid(int).hash_code()) { + if (attr.variant_.TypeId() == typeid(int).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { + } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(string).hash_code()) { return vistor(attr.variant_.GetString()); @@ -129,7 +129,7 @@ class Attribute { return vistor(attr.variant_.Get>()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { + } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); @@ -137,7 +137,6 @@ class Attribute { return vistor(attr.variant_.Get()); } else { PADDLE_MOBILE_THROW_EXCEPTION("type not support"); - exit(0); } } diff --git a/src/framework/cl/cl_deleter.h b/src/framework/cl/cl_deleter.h new file mode 100644 index 0000000000000000000000000000000000000000..55af631174ae9f2a7815c2da35ebadda3ebfd9e9 --- /dev/null +++ b/src/framework/cl/cl_deleter.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CL/cl.h" + +struct CLKernelDeleter { + template + void operator()(T *clKernelObj) { + clReleaseKernel(clKernelObj); + } +}; + +struct CLMemDeleter { + template + void operator()(T *clMemObj) { + clReleaseMemObject(clMemObj); + } +}; + +struct CLEventDeleter { + template + void operator()(T *clEventObj) { + clReleaseEvent(clEventObj); + } +}; + +struct CLCommQueueDeleter { + template + void operator()(T *clQueueObj) { + clReleaseCommandQueue(clQueueObj); + } +}; + +struct CLContextDeleter { + template + void operator()(T *clContextObj) { + clReleaseContext(clContextObj); + } +}; + +struct CLProgramDeleter { + template + void operator()(T *clProgramObj) { + clReleaseProgram(clProgramObj); + } +}; diff --git a/src/framework/cl/cl_engine.cpp b/src/framework/cl/cl_engine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04d1675227aac0967f8dee94aa7a27ae5ea73c0f --- /dev/null +++ b/src/framework/cl/cl_engine.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/cl/cl_engine.h" +#include "CL/cl.h" +#include "framework/cl/cl_tool.h" + +#include +#include + +namespace paddle_mobile { +namespace framework { + +bool CLEngine::Init() { + if (initialized_) { + return true; + } + cl_int status; + SetPlatform(); + SetClDeviceId(); + + initialized_ = true; + return initialized_; + // setClCommandQueue(); + // std::string filename = "./HelloWorld_Kernel.cl"; + // loadKernelFromFile(filename.c_str()); + // buildProgram(); +} + +CLEngine *CLEngine::Instance() { + static CLEngine cl_engine_; + cl_engine_.Init(); + return &cl_engine_; +} + +bool CLEngine::SetPlatform() { + platform_ = NULL; // the chosen platform + cl_uint numPlatforms; // the NO. of platforms + cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); + + /**For clarity, choose the first available platform. */ + if (numPlatforms > 0) { + cl_platform_id *platforms = reinterpret_cast( + malloc(numPlatforms * sizeof(cl_platform_id))); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + platform_ = platforms[0]; + free(platforms); + return true; + } else { + return false; + } +} + +bool CLEngine::SetClDeviceId() { + cl_uint numDevices = 0; + devices_ = NULL; + cl_int status = + clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + + if (numDevices > 0) { + devices_ = reinterpret_cast( + malloc(numDevices * sizeof(cl_device_id))); + status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, + NULL); + return true; + } + return false; +} + +// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel( +// const std::string &kernel_name) { +// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel( +// clCreateKernel(program_.get(), kernel_name.c_str(), NULL)); +// return std::move(kernel); +//} +// +// bool CLEngine::SetClCommandQueue() { +// cl_int status; +// command_queue_.reset( +// clCreateCommandQueue(context_.get(), devices_[0], 0, &status)); +// return true; +//} + +// bool 
CLEngine::SetClContext() { +// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL)); +// return true; +//} + +// bool CLEngine::LoadKernelFromFile(const char *kernel_file) { +// size_t size; +// char *str; +// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary)); +// +// if (!f.is_open()) { +// return false; +// } +// +// size_t fileSize; +// f.seekg(0, std::fstream::end); +// size = fileSize = (size_t)f.tellg(); +// f.seekg(0, std::fstream::beg); +// str = new char[size + 1]; +// if (!str) { +// f.close(); +// return 0; +// } +// +// f.read(str, fileSize); +// f.close(); +// str[size] = '\0'; +// const char *source = str; +// size_t sourceSize[] = {strlen(source)}; +// program_.reset( +// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize, +// NULL)); +// return true; +//} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..f9f373b2a74087960b03c55ec922f95f187cfbc4 --- /dev/null +++ b/src/framework/cl/cl_engine.h @@ -0,0 +1,144 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "CL/cl.h" +#include "common/enforce.h" +#include "common/log.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +class CLEngine { + public: + static CLEngine *Instance(); + + bool Init(); + + std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() { + cl_int status; + cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status); + std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c); + CL_CHECK_ERRORS(status); + return std::move(context_ptr); + } + + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( + cl_context context) { + cl_int status; + cl_command_queue queue = + clCreateCommandQueue(context, devices_[0], 0, &status); + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr( + queue); + CL_CHECK_ERRORS(status); + return std::move(command_queue_ptr); + } + + std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith( + cl_context context, std::string file_name) { + FILE *file = fopen(file_name.c_str(), "rb"); + PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", + file_name.c_str()); + fseek(file, 0, SEEK_END); + int64_t size = ftell(file); + PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); + rewind(file); + char *data = new char[size + 1]; + size_t bytes_read = fread(data, 1, size, file); + data[size] = '\0'; + PADDLE_MOBILE_ENFORCE(bytes_read == size, + "read binary file bytes do not match with fseek"); + fclose(file); + + const char *source = data; + size_t sourceSize[] = {strlen(source)}; + cl_program p = + clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); + + DLOG << " cl kernel file name: " << file_name; + DLOG << " source size: " << sourceSize[0]; + CL_CHECK_ERRORS(status_); + + std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); + + return std::move(program_ptr); + } + + std::unique_ptr<_cl_event, 
CLEventDeleter> CreateEvent(cl_context context) { + cl_event event = clCreateUserEvent(context, &status_); + std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event); + CL_CHECK_ERRORS(status_); + return std::move(event_ptr); + } + + bool BuildProgram(cl_program program) { + cl_int status; + status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel", + 0, 0); + + CL_CHECK_ERRORS(status); + + if (status_ == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = reinterpret_cast(malloc(log_size)); + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + DLOG << " program build error: " << log; + } + + if (status == CL_SUCCESS) { + return true; + } else { + return false; + } + } + + cl_device_id DeviceID(int index = 0) { return devices_[index]; } + + private: + CLEngine() { initialized_ = false; } + + bool SetPlatform(); + + bool SetClDeviceId(); + + bool initialized_; + + cl_platform_id platform_; + + cl_device_id *devices_; + + cl_int status_; + + std::unique_ptr<_cl_program, CLProgramDeleter> program_; + + // bool SetClContext(); + + // bool SetClCommandQueue(); + + // bool LoadKernelFromFile(const char *kernel_file); + + // bool BuildProgram(); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.cpp b/src/framework/cl/cl_half.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2877289325d983d0c7d9756732254e0a4ed831b6 --- /dev/null +++ b/src/framework/cl/cl_half.cpp @@ -0,0 +1,518 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +#include "framework/cl/cl_half.h" + +namespace paddle_mobile { +namespace framework { + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 
0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 
0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 
0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 
0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 
0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 
0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 
0x3807a000, + 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 
0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 
0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 
0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 
0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 
0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 
+ 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +half_t Float2Half(float f) { + uint32_t v = *reinterpret_cast(&f); + return basetable[(v >> 23) & 0x1ff] + + ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); +} + +float Half2Float(half_t h) { + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + + exponenttable[h >> 10]; + return *reinterpret_cast(&v); +} + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { + for (int i = 0; i < count; ++i) { + h_array[i] = Float2Half(f_array[i]); + } +} + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { + for (int i = 0; i < count; ++i) { + f_array[i] = Half2Float(h_array[i]); + } +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.h b/src/framework/cl/cl_half.h new file mode 100644 index 0000000000000000000000000000000000000000..9b05740f1e19af66036a1562243102e5ba42ab1b --- /dev/null +++ b/src/framework/cl/cl_half.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +namespace paddle_mobile { +namespace framework { + +typedef uint16_t half_t; + +half_t Float2Half(float f); + +float Half2Float(half_t h); + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..bea91ee24ceb5e9011708bd277629a07beb4b8ef --- /dev/null +++ b/src/framework/cl/cl_helper.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "common/log.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_image.h" +#include "framework/cl/cl_scope.h" + +namespace paddle_mobile { +namespace framework { + +class CLHelper { + public: + CLHelper() = default; + + explicit CLHelper(CLScope *scope) : scope_(scope) {} + + void AddKernel(const std::string &kernel_name, const std::string &file_name) { + DLOG << " begin add kernel "; + auto kernel = scope_->GetKernel(kernel_name, file_name); + DLOG << " add kernel ing "; + kernels.emplace_back(std::move(kernel)); + } + + cl_kernel KernelAt(const int index) { + DLOG << " kernel count: " << kernels.size(); + return kernels[index].get(); + } + + cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); } + + cl_context CLContext() { return scope_->Context(); } + + std::vector DefaultWorkSize(const CLImage &image) { + // n c h w + auto image_dim = image.dims(); + if (image_dim.size() == 4) { + auto n = image_dim[0]; + auto h = image_dim[2]; + auto w = image_dim[3]; + auto image_width = image.ImageWidth(); + auto work_size_0 = image_width / w; + auto work_size_1 = w; + auto work_size_2 = n * h; + return {work_size_0, work_size_1, work_size_2}; + } else if (image_dim.size() == 2) { + return {1, image.ImageWidth(), image.ImageHeight()}; + } else if (image_dim.size() == 1) { + return {1, image.ImageWidth(), 1}; + } + PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); + } + + private: + CLScope *scope_; + std::vector> kernels; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f32de0a61461d9a9b28d4a0cf5e13ecc9d564cf5 --- /dev/null +++ b/src/framework/cl/cl_image.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_image.h" + +namespace paddle_mobile { +namespace framework { + +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, + cl_command_queue commandQueue) { + // TODO(yangfei): need imp +} + +void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, + cl_command_queue commandQueue) { + // TODO(yangfei): need imp +} + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &cl_image) { + int width = cl_image.ImageDims()[0]; + int height = cl_image.ImageDims()[1]; + + half_t *image_data = new half_t[height * width * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, + region, 0, 0, image_data, 0, NULL, NULL); + + CL_CHECK_ERRORS(err); + + float *tensor_data = new float[cl_image.numel()]; + auto converter = cl_image.Converter(); + converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), + cl_image.dims()); + int stride = cl_image.numel() / 20; + stride = stride > 0 ? 
stride : 1; + + printer << " dims: " << cl_image.dims() << "\n"; + for (int i = 0; i < cl_image.numel(); i += stride) { + printer << tensor_data[i] << " "; + } + + delete[](tensor_data); + delete[](image_data); + + return printer; +} +#endif +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h new file mode 100644 index 0000000000000000000000000000000000000000..35f60d3b773937d381447b23b64985ce543fddee --- /dev/null +++ b/src/framework/cl/cl_image.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "CL/cl.h" + +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/cl/cl_half.h" +#include "framework/cl/cl_image_converter.h" +#include "framework/cl/cl_tool.h" +#include "framework/ddim.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace framework { + +class CLImage { + public: + CLImage() = default; + + ~CLImage() { + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + } + + if (image_converter_) { + delete (image_converter_); + } + } + /* + * will not hold input tensor data, memcpy in this method + * */ + void SetTensorData(float *tensorData, const DDim &dim) { + int numel = product(dim); + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + tensor_data_ = nullptr; + } + tensor_data_ = new float[numel]; + memcpy(tensor_data_, tensorData, numel * sizeof(float)); + tensor_dims_ = dim; + } + + /* + * need call SetTensorData first + * + * folder when one dim or two dim + * */ + void InitCLImage(cl_context context, cl_command_queue command_queue) { + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + InitCLImage(context, command_queue, folder_converter); + } + + void InitCLImage(cl_context context, cl_command_queue command_queue, + CLImageConverterBase *converter) { + if (image_converter_ != nullptr) { + delete (image_converter_); + } + + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + + DLOG << " begin init cl image "; + image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); + + half_t *image_data = new half_t[product(image_dims_) * 4]; + + DLOG << " convert to image"; + converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); + DLOG << " end convert to image"; + + InitCLImage(context, image_dims_[0], image_dims_[1], image_data); + + delete[](image_data); + delete[](tensor_data_); + + 
command_queue_ = command_queue; + tensor_data_ = nullptr; + image_converter_ = converter; + initialized_ = true; + DLOG << " end init cl image"; + } + + void InitNImage(cl_context context, cl_command_queue command_queue) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock(); + InitCLImage(context, command_queue, folder_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); + } + void InitDWImage(cl_context context, cl_command_queue command_queue) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock(); + InitCLImage(context, command_queue, dw_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); + } + + void InitEmptyImage(cl_context context, cl_command_queue command_queue, + const DDim &dim) { + PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, + " empty image tensor data shouldn't have value"); + + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + + DLOG << " to get image dims "; + image_dims_ = folder_converter->InitImageDimInfoWith(dim); + DLOG << " end get image dims " << image_dims_; + + InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); + + tensor_dims_ = dim; + command_queue_ = command_queue; + image_converter_ = folder_converter; + cl_event_ = CLEngine::Instance()->CreateEvent(context); + initialized_ = true; + DLOG << " end init cl image"; + } + + cl_mem GetCLImage() const { return cl_image_.get(); } + + const DDim &ImageDims() const { return image_dims_; } + + inline size_t ImageWidth() const { return image_dims_[0]; } + + inline size_t ImageHeight() const { return image_dims_[1]; } + + inline cl_command_queue CommandQueue() const { return command_queue_; } + + /* + * resize original tensor dim + * */ + inline 
CLImage &Resize(const DDim &dims) { + tensor_dims_ = dims; + return *this; + } + + template + T *data() const { + if (initialized_) { + PADDLE_MOBILE_THROW_EXCEPTION( + " cl image has initialized, tensor data has been deleted, can't use " + "tensor data"); + } + return reinterpret_cast(tensor_data_); + } + + /* + * numel of tensor dim + * */ + inline int64_t numel() const { return product(tensor_dims_); } + + /* + * original tensor dim + * */ + const DDim &dims() const { return tensor_dims_; } + + cl_event GetClEvent() const { return cl_event_.get(); } + + CLImageConverterBase *Converter() const { return image_converter_; } + + private: + void InitCLImage(cl_context context, int width, int height, void *data) { + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; + cl_image_desc cid = { + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = width, + .image_height = height, + .image_depth = 1, + .image_array_size = 1, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + // .buffer = nullptr + }; + cid.buffer = nullptr; + cl_int err; + cl_mem cl_image = clCreateImage( + context, CL_MEM_READ_WRITE | (data ? 
CL_MEM_COPY_HOST_PTR : 0), + &cf, // const cl_image_format *image_format + &cid, // const cl_image_desc *image_desc + data, // void *host_ptr + &err); + cl_image_.reset(cl_image); + if (err != CL_SUCCESS) { + CL_CHECK_ERRORS(err); + PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); + } + } + + bool initialized_ = false; + std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_; + std::unique_ptr<_cl_event, CLEventDeleter> cl_event_; + DDim tensor_dims_; + DDim image_dims_; + float *tensor_data_ = nullptr; + cl_context context_; + cl_command_queue command_queue_; + CLImageConverterBase *image_converter_ = nullptr; +}; + +void TensorToCLImage(Tensor *tensor, CLImage *image, + cl_command_queue commandQueue); + +void CLImageToTensor(CLImage *image, Tensor *tensor, + cl_command_queue commandQueue); + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &image); +#endif + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13094a8d05ac6f7f8d2451a3498da058b37ee98b --- /dev/null +++ b/src/framework/cl/cl_image_converter.cpp @@ -0,0 +1,393 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/cl/cl_image_converter.h" + +namespace paddle_mobile { +namespace framework { + +const DDim &CLImageConverterDefault::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + return make_ddim({width, height}); +} + +void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = nchw; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + int width = image_dim[0]; 
+ int height = image_dim[0]; + + float *p = tensor; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +const DDim &CLImageConverterFolder::InitImageDimInfoWith( + const DDim &tensor_dim) { + if (tensor_dim.size() <= 2) { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + int width = (tdim[1] + 3) / 4; + int height = tdim[0]; + + width_of_one_block_ = width; + height_of_one_block_ = height; + c_block_ = 1; + + return make_ddim({width, height}); + + } else { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); + } +} + +void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); + + } else { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + + DDim image_dim = InitImageDimInfoWith(tensor_dim); + int width = image_dim[0]; + + for (int h = 0; h < tdim[0]; h++) { + for (int w = 0; w < tdim[1]; w++) { + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * 
tdim[1] + w]); + } + } + } +} + +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); + + } else { + int width = image_dim[0]; + int height = image_dim[1]; + int H, W; + + if (tensor_dim.size() == 2) { + H = tensor_dim[0]; + W = tensor_dim[1]; + } else if (tensor_dim.size() == 1) { + H = 1; + W = tensor_dim[0]; + } + float *p = tensor; + + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); + } + } + } +} + +const DDim &CLImageConverterNWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + auto image_dim = InitImageDimInfoWith(tensor_dim); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < block * 4; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + if (n < N) { + image[index] = Float2Half(*p); + p++; + } else { + image[index] = 0.0; + } + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +void 
CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + *p = Half2Float(image[index]); + p++; + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +const DDim &CLImageConverterDWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[1]; + C = new_dims[0]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = tensor; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < 
C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[1]; + int C = tensor_dim[0]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[0]; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..02887b0cd468a45630122bb3f236c0775ac1eaa1 --- /dev/null +++ b/src/framework/cl/cl_image_converter.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "framework/cl/cl_half.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +class CLImageConverterBase { + public: + virtual void NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) = 0; + + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, + const DDim &tensor_dim) = 0; + virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0; +}; + +class CLImageConverterDefault : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +class CLImageConverterFolder : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; +class CLImageConverterDWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim 
&image_dim, + const DDim &tensor_dim); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h new file mode 100644 index 0000000000000000000000000000000000000000..0965b133e6d8270b7cd6e28c8ed9a33739b2e2a8 --- /dev/null +++ b/src/framework/cl/cl_scope.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "CL/cl.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +class CLScope { + public: + CLScope() { + CLEngine *engin = CLEngine::Instance(); + context_ = engin->CreateContext(); + command_queue_ = engin->CreateClCommandQueue(context_.get()); + } + + cl_command_queue CommandQueue() { return command_queue_.get(); } + + std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( + const std::string &kernel_name, const std::string &file_name) { + DLOG << " to get program " << file_name; + auto program = Program(file_name); + DLOG << " end get program ~ "; + DLOG << " to create kernel: " << kernel_name; + std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( + clCreateKernel(program, kernel_name.c_str(), &status_)); + CL_CHECK_ERRORS(status_); + DLOG << " end create kernel ~ "; + return std::move(kernel); + } + + cl_context Context() { return context_.get(); } + + 
cl_program Program(const std::string &file_name) { + auto it = programs_.find(file_name); + if (it != programs_.end()) { + return it->second.get(); + } + + auto program = CLEngine::Instance()->CreateProgramWith( + context_.get(), "./cl_kernel/" + file_name); + + DLOG << " --- begin build program -> " << file_name << " --- "; + CLEngine::Instance()->BuildProgram(program.get()); + DLOG << " --- end build program -> " << file_name << " --- "; + + programs_[file_name] = std::move(program); + + return programs_[file_name].get(); + } + + private: + cl_int status_; + std::unique_ptr<_cl_context, CLContextDeleter> context_; + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; + std::unordered_map> + programs_; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_tensor.h b/src/framework/cl/cl_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..b853fa0e8d734c38de2fdc53f766d735dc72bb20 --- /dev/null +++ b/src/framework/cl/cl_tensor.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "CL/cl.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/tensor_base.h" + +namespace paddle_mobile { +namespace framework { + +class CLTensor : TensorBase { + public: + CLTensor(cl_context context, cl_command_queue command_queue) + : context_(context), command_queue_(command_queue) {} + + CLTensor() = default; + + /* + * if init method haven't set context and command_queue, need set + * */ + void SetContextAndCommandQueue(cl_context context, + cl_command_queue command_queue) { + context_ = context; + command_queue_ = command_queue; + } + + /*! Resize the dimensions of the memory block. */ + inline CLTensor &Resize(const DDim &dims) { + dims_ = dims; + return *this; + } + + template + inline cl_mem mutable_with_data(const T *data) { + int64_t size = numel() * sizeof(T); + + holder_.reset(new PlaceholderImpl( + size, reinterpret_cast(const_cast(data)), typeid(T), + context_, command_queue_)); + return reinterpret_cast(holder_->ptr()); + } + + inline cl_mem mutable_data(std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") + int64_t size = numel() * SizeOfType(type); + if (holder_ == nullptr || holder_->size() < size + offset_) { + holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_)); + offset_ = 0; + } + return reinterpret_cast(holder_->ptr()); + } + + /** + * @brief Return a pointer to cl buffer. + * @note If not exist, then allocation. + */ + template + inline cl_mem mutable_data() { + return reinterpret_cast(mutable_data(typeid(T))); + } + + /** + * @brief Return a pointer to cl buffer. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. 
+ */ + template + inline cl_mem mutable_data(DDim dims) { + Resize(dims); + return mutable_data(); + } + + inline cl_mem CLBuffer() { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr())); + } + + template + inline T *Data() { + if (host_ptr_) { + delete (host_ptr_); + host_ptr_ = nullptr; + } + cl_mem buffer = CLBuffer(); + host_ptr_ = new char[holder_->size()]; + cl_int status; + status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0, + holder_->size(), host_ptr_, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + return reinterpret_cast(host_ptr_); + } + + int memorySize() { return holder_->size(); } + + ~CLTensor() { + DLOG << "~CLTensor"; + if (host_ptr_) { + DLOG << " delete host ptr "; + delete (host_ptr_); + host_ptr_ = nullptr; + } + } + + private: + cl_context context_; + cl_command_queue command_queue_; + void *host_ptr_ = nullptr; + + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(size_t size, void *input, std::type_index type, + cl_context context, cl_command_queue command_queue) + : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + size, reinterpret_cast(input), NULL)), + size_(size), + type_(type), + command_queue_(command_queue) {} + + PlaceholderImpl(size_t size, std::type_index type, cl_context context, + cl_command_queue command_queue) + : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)), + size_(size), + type_(type), + command_queue_(command_queue) {} + + virtual size_t size() const { return size_; } + + virtual void *ptr() const { return static_cast(ptr_.get()); } + + virtual std::type_index type() const { return type_; } + + virtual void set_type(std::type_index type) { type_ = type; } + + std::unique_ptr<_cl_mem, CLMemDeleter> ptr_; + + size_t size_; + + /* the current type of memory */ + std::type_index type_; + + cl_command_queue command_queue_; + }; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git 
a/src/framework/cl/cl_tool.cpp b/src/framework/cl/cl_tool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..827642b6b73cfaee02f4053dce798bf6b3c52f4b --- /dev/null +++ b/src/framework/cl/cl_tool.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +const char *opencl_error_to_str(cl_int error) { +#define CASE_CL_CONSTANT(NAME) \ + case NAME: \ + return #NAME; + // Suppose that no combinations are possible. 
+ switch (error) { + CASE_CL_CONSTANT(CL_SUCCESS) + CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) + CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) + CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) + CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) + CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) + CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) + CASE_CL_CONSTANT(CL_MAP_FAILURE) + CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) + CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_VALUE) + CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) + CASE_CL_CONSTANT(CL_INVALID_PLATFORM) + CASE_CL_CONSTANT(CL_INVALID_DEVICE) + CASE_CL_CONSTANT(CL_INVALID_CONTEXT) + CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) + CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) + CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) + CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) + CASE_CL_CONSTANT(CL_INVALID_SAMPLER) + CASE_CL_CONSTANT(CL_INVALID_BINARY) + CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) + CASE_CL_CONSTANT(CL_INVALID_KERNEL) + CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) + CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) + CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) + CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) + CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) + CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) + CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_EVENT) + CASE_CL_CONSTANT(CL_INVALID_OPERATION) + 
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) + CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) + CASE_CL_CONSTANT(CL_INVALID_PROPERTY) + + default: + return "UNKNOWN ERROR CODE"; + } +#undef CASE_CL_CONSTANT +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_tool.h b/src/framework/cl/cl_tool.h new file mode 100644 index 0000000000000000000000000000000000000000..25d5bfc584b59e4fe9d22a922b601f8c32892fd1 --- /dev/null +++ b/src/framework/cl/cl_tool.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CL/cl.h" + +namespace paddle_mobile { +namespace framework { + +const char* opencl_error_to_str(cl_int error); + +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "OpenCL error with code %s happened in file %s at line %d. 
" \ + "Exiting.\n", \ + paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ + __LINE__); \ + } + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 0ba31ef9b7016b453b34cc4a023b0841b2110540..665b5315bc1c0fca7b9e62f89062f375a9a011be 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) { return DataLayout::kAnyLayout; } else { PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) - exit(0); } } @@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { return "ANY_LAYOUT"; default: PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") - exit(0); break; } } diff --git a/src/framework/dim.h b/src/framework/dim.h index 85e86076e1de53fa80b75f56237901da49e22eb9..7c78659e3baacdf707dc46884c099dfd0cd284bb 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -42,7 +42,7 @@ struct Dim { : head(idx % size.head), tail(idx / size.head, size.tail) {} /** Construct a Dim with each dimension set to the given index */ - Dim(int64_t idx) : head(idx), tail(idx) {} + explicit Dim(int64_t idx) : head(idx), tail(idx) {} bool operator==(const Dim &o) const { return (head == o.head) && (tail == o.tail); @@ -65,7 +65,7 @@ template <> struct Dim<0> { static constexpr int dimensions = 0; - Dim(int64_t _head) {} + explicit Dim(int64_t _head) {} Dim() {} @@ -131,7 +131,6 @@ int64_t &indexer(Dim &dim, int idx) { template <> int64_t &indexer<0>(Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } template @@ -148,7 +147,6 @@ int64_t indexer(const Dim &dim, int idx) { template <> int64_t indexer<0>(const Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } } // namespace diff --git a/src/io/executor.cpp b/src/framework/executor.cpp similarity index 60% rename from 
src/io/executor.cpp rename to src/framework/executor.cpp index 9efec27c9df3d51a3411db87faee924b374d2ac7..c755924b8e3a1ede3f0d01ac418e1f3f04a0ffa9 100644 --- a/src/io/executor.cpp +++ b/src/framework/executor.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/executor.h" +#include "framework/executor.h" #include #include #include @@ -26,11 +26,24 @@ limitations under the License. */ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" -#include "operators/math/gemm.h" + +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include "common/threadpool.h" +#endif + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif namespace paddle_mobile { +namespace framework { using framework::Variable; +using framework::Variable; + +#pragma mark - executor template Executor::Executor(const framework::Program p, int batch_size, @@ -390,15 +403,92 @@ std::vector::Ptype> Executor::Predict( const std::vector &input, const std::vector &dims) { framework::Tensor tensor(input, framework::make_ddim(dims)); std::shared_ptr output_tensor = Predict(tensor, 0); - Executor::Ptype *output_ptr = - output_tensor->data::Ptype>(); - std::vector::Ptype> result_vector; - for (int j = 0; j < output_tensor->numel(); ++j) { - result_vector.push_back(output_ptr[j]); + if (output_tensor != nullptr) { + Executor::Ptype *output_ptr = + output_tensor->data::Ptype>(); + std::vector::Ptype> result_vector; + for (int j = 0; j < output_tensor->numel(); ++j) { + result_vector.push_back(output_ptr[j]); + } + return result_vector; + } else { + DLOG << "return empty vector"; + return {}; } - return result_vector; } +#ifdef PADDLE_MOBILE_FPGA + +template +void Executor::InjectVariable(const framework::Tensor &t, + string var_name) { + framework::Variable *g_feed_value = 
program_.scope->Var(var_name); + framework::Tensor *feed_tensor = + g_feed_value->GetMutable(); + feed_tensor->Resize(t.dims()); + feed_tensor->ShareDataWith(t); +} + +template +void Executor::FeedData(const framework::Tensor &t) { + InjectVariable(t, "feed"); +} + +template +std::shared_ptr Executor::FetchResult(int id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + + PADDLE_MOBILE_ENFORCE(id < ops.size(), "Index out of range"); + auto last_op = id < 0 ? ops[ops.size() - 1] : ops[id]; + auto output_map = last_op->Outputs(); + std::vector out_keys = last_op->GetOutKeys(); + PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output"); + auto *output_tensor = framework::GetVarValue( + out_keys[0], output_map, *(program_.scope)); + return std::make_shared(framework::Tensor(*output_tensor)); +} + +template +void Executor::Predict_From_To(int start, int end) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + end = end < 0 ? 
static_cast(ops.size()) : end; + PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(), + "start or end parameter is wrong"); + +#ifdef PADDLE_MOBILE_PROFILE + std::vector profile(ops.size()); +#endif + for (int i = start; i < end; i++) { +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + DLOG << "Running op: " << i << " " << ops[i]->Type(); + ops[i]->Run(); + +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + } +} + +template +void Executor::Predict_From(int start) { + Predict_From_To(start); +} + +template +void Executor::Predict_To(int end) { + Predict_From_To(0, end); +} +#endif + #ifdef PADDLE_MOBILE_FPGA template void Executor::InjectVariable(const framework::Tensor &t, @@ -470,8 +560,232 @@ void Executor::Predict_To(int end) { } #endif +#ifdef PADDLE_MOBILE_CL +template +void Executor::LoadMemory(const framework::VarDesc var_desc, + float *tensorInput, char **data) {} + +template <> +void Executor::LoadMemory( + const framework::VarDesc var_desc, float *tensorInput, char **data) { + // 1. version + uint32_t version = *reinterpret_cast(*data); + + (*data) += sizeof(uint32_t); + + // 2 Lod information + uint64_t *lod_level_ptr = new uint64_t(); + memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); + uint64_t lod_level = *lod_level_ptr; + delete lod_level_ptr; + (*data) += sizeof(uint64_t); + + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size = *reinterpret_cast(*data); + (*data) += sizeof(uint64_t); + std::vector tmp(size / sizeof(size_t)); + + for (int k = 0; k < tmp.size(); ++k) { + tmp[k] = *reinterpret_cast(*data); + (*data) += sizeof(size_t); + } + } + + // 3. tensor version + uint32_t tensor_version = *reinterpret_cast(*data); + (*data) += sizeof(uint32_t); + + // 4. 
tensor desc + int32_t size = *reinterpret_cast(*data); + (*data) += sizeof(int32_t); + + std::unique_ptr buf(new char[size]); + for (int m = 0; m < size; ++m) { + buf.get()[m] = (*data)[m]; + } + (*data) += (sizeof(char) * size); + + const framework::TensorDesc &desc = var_desc.Tensor_desc(); + int memory_size = 1; + for (auto l : desc.Dims()) { + memory_size *= l; + } + + void *memory = nullptr; + // int type_size = 0; + // switch (desc.DataType()) { + // case framework::VARTYPE_TYPE_FP16: + // type_size = 2; + // break; + // case framework::VARTYPE_TYPE_FP32: + // type_size = 4; + // memory = tensor->mutable_data(); + // break; + // case framework::VARTYPE_TYPE_FP64: + // type_size = 8; + // break; + // case framework::VARTYPE_TYPE_INT32: + // memory = tensor->mutable_data(); + // type_size = 4; + // break; + // case framework::VARTYPE_TYPE_INT64: + // type_size = 8; + // break; + // case framework::VARTYPE_TYPE_BOOL: + // type_size = 1; + // break; + // default: + // break; + // } + int type_size = 4; + memory = tensorInput; + if (program_.quantification) { + float min_value; + float max_value; + + memcpy(&min_value, *data, sizeof(float)); + memcpy(&max_value, *data + sizeof(float), sizeof(float)); + *data += 2 * sizeof(float); + const float factor = (max_value - min_value) / 255.0; + uint8_t *uint8_data = reinterpret_cast(*data); + for (int k = 0; k < memory_size; ++k) { + static_cast(memory)[k] = uint8_data[k] * factor + min_value; + } + *data += (memory_size * sizeof(uint8_t)); + } else { + for (int n = 0; n < memory_size; n++) { + float value; + memcpy(&value, *data + n * type_size, type_size); + if (value < 1e-30 && value > -1e-30) { + static_cast(memory)[n] = 0.0; + } else { + static_cast(memory)[n] = value; + } + } + (*data) += (sizeof(char) * memory_size * type_size); + } +} + +template <> +void Executor::InitMemory() { + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = 
program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + CLImage *cl_image = nullptr; + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + var->template GetMutable(); + continue; + } else { + cl_image = var->template GetMutable(); + } + + char *origin_data = + ReadFileToBuff(program_.model_path + "/" + var_desc->Name()); + char *data = origin_data; + cl_context context = program_.scope->GetCLScpoe()->Context(); + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + int numel = 1; + for (auto l : desc.Dims()) { + numel *= l; + } + DLOG << var_desc->Name(); + float *tensorInput = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + LoadMemory(*var_desc, tensorInput, &data); + + framework::DDim ddim = framework::make_ddim(desc.Dims()); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + + delete origin_data; + paddle_mobile::memory::Free(tensorInput); + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + auto cl_image = var->template GetMutable(); + cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); + + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + // framework::DDim ddim = framework::make_ddim(desc.Dims()); + framework::DDim ddim = cl_image->dims(); + DLOG << var_desc->Name(); + cl_image->InitEmptyImage(context, command_queue, ddim); + } + } + } + } +} + +template <> +void Executor::InitCombineMemory() { + char *origin_data; + if (program_.combined_params_buf && program_.combined_params_len) { + LOG(kLOG_INFO) << "use outter memory"; + origin_data = reinterpret_cast(program_.combined_params_buf); + } else { + LOG(kLOG_INFO) << " begin init combine memory"; + origin_data = ReadFileToBuff(program_.para_path); + } + PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); + float *data = reinterpret_cast(origin_data); + + for (const auto &block : 
to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + CLImage *cl_image = nullptr; + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + var->template GetMutable(); + continue; + } else { + cl_image = var->template GetMutable(); + } + + cl_context context = program_.scope->GetCLScpoe()->Context(); + + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + framework::DDim ddim = framework::make_ddim(desc.Dims()); + + int numel = 1; + for (int i = 0; i < ddim.size(); i++) { + numel = numel * ddim[i]; + } + float *tensorInput = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + LoadMemory(*var_desc, tensorInput, &origin_data); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + + paddle_mobile::memory::Free(tensorInput); + } else { + auto cl_image = var->template GetMutable(); + cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + framework::DDim ddim = cl_image->dims(); + // framework::DDim ddim = framework::make_ddim(desc.Dims()); + cl_image->InitEmptyImage(context, command_queue, ddim); + } + } + } + delete origin_data; + LOG(kLOG_INFO) << " end init combine memory "; +} + +#endif + template class Executor; -template class Executor; + template class Executor; +template class Executor; + +template class Executor; + +} // namespace framework } // namespace paddle_mobile diff --git a/src/io/executor.h b/src/framework/executor.h similarity index 95% rename from src/io/executor.h rename to src/framework/executor.h index 98906749effb7e46318157085c4505c57726ec62..be1c87e239c9c2ace9b4791f9769c176c9d5ef8e 100644 --- a/src/io/executor.h +++ b/src/framework/executor.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "framework/tensor.h" namespace paddle_mobile { +namespace framework { template class Executor { @@ -79,7 +80,10 @@ class Executor { void LoadMemory(void **data, const std::shared_ptr var_desc, framework::LoDTensor *tensor); - +#ifdef PADDLE_MOBILE_CL + void LoadMemory(const framework::VarDesc var_desc, float *tensorInput, + char **data); +#endif framework::Program program_; int batch_size_ = 1; std::shared_ptr to_predict_program_; @@ -97,4 +101,5 @@ class Executor { bool loddable_ = false; }; +} // namespace framework } // namespace paddle_mobile diff --git a/src/io/loader.cpp b/src/framework/loader.cpp similarity index 60% rename from src/io/loader.cpp rename to src/framework/loader.cpp index 7dd55950be240a88a7521d4be260416625419015..a434314730eb40b7e4017050a84a7d9742934396 100644 --- a/src/io/loader.cpp +++ b/src/framework/loader.cpp @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "io/loader.h" +#include "framework/loader.h" #include "framework/lod_tensor.h" #include "framework/program/program-optimize/program_optimize.h" +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif namespace paddle_mobile { -using framework::Variable; +namespace framework { /** * muteandresize tensor as originProgramDesc and scope in loadParams @@ -26,23 +29,24 @@ using framework::Variable; * @param originProgramDesc * @param scope */ -void InitMemoryFromProgram( - std::shared_ptr &originProgramDesc, // NOLINT - std::shared_ptr &scope) { // NOLINT +template +void Loader::InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope) { for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &var_desc : block->Vars()) { auto var = scope.get()->Var(var_desc->Name()); - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Persistable()) { auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); + auto tensor = var->GetMutable(); + tensor->Resize(make_ddim(dim)); } else { auto dim = var_desc->Tensor_desc().Dims(); PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); + auto tensor = var->GetMutable(); + tensor->Resize(make_ddim(dim)); } } else { // TODO(codeWorm): some. 
@@ -50,6 +54,36 @@ void InitMemoryFromProgram( } } } + +#ifdef PADDLE_MOBILE_CL +template <> +void Loader::InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope) { + for (const auto &block : originProgramDesc.get()->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = scope.get()->Var(var_desc->Name()); + if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable()) { + auto dim = var_desc->Tensor_desc().Dims(); + // auto tensor = var->GetMutable(); + auto cl_image = var->GetMutable(); + cl_image->Resize(make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + dim[0] = 1; + auto cl_image = var->GetMutable(); + cl_image->Resize(make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. + } + } + } +} +#endif + /** * fusion and print someinfos * @tparam Dtype @@ -61,19 +95,18 @@ void InitMemoryFromProgram( */ template void FusionAndPrintInfos( - bool optimize, bool can_add_split, - framework::Program &program, // NOLINT - const std::shared_ptr &originProgramDesc) { + bool optimize, bool can_add_split, Program *program, + const std::shared_ptr &originProgramDesc) { if (optimize) { - framework::ProgramOptimize program_optimize; - program.optimizeProgram = + ProgramOptimize program_optimize; + program->optimizeProgram = program_optimize.FusionOptimize(originProgramDesc, can_add_split); - if (!program.optimizeProgram) { - program.optimizeProgram = originProgramDesc; + if (!program->optimizeProgram) { + program->optimizeProgram = originProgramDesc; } } if (optimize) { - program.optimizeProgram->Description("optimize: "); + program->optimizeProgram->Description("optimize: "); } else { originProgramDesc->Description("program: "); } @@ -102,9 +135,10 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { } template -const framework::Program Loader::Load( - const std::string &dirname, bool optimize, bool 
quantification, - bool can_add_split) { +const Program Loader::Load(const std::string &dirname, + bool optimize, + bool quantification, + bool can_add_split) { auto program = this->LoadProgram(dirname + "/__model__", optimize, quantification, can_add_split); program.model_path = dirname; @@ -112,9 +146,10 @@ const framework::Program Loader::Load( } template -const framework::Program Loader::Load( - const std::string &model_path, const std::string ¶_path, bool optimize, - bool quantification) { +const Program Loader::Load(const std::string &model_path, + const std::string ¶_path, + bool optimize, + bool quantification) { auto program = this->LoadProgram(model_path, optimize, quantification); program.para_path = para_path; @@ -124,7 +159,7 @@ const framework::Program Loader::Load( } template -const framework::Program Loader::LoadProgram( +const Program Loader::LoadProgram( const std::string &model_path, bool optimize, bool quantification, bool can_add_split) { std::string model_filename = model_path; @@ -141,29 +176,29 @@ const framework::Program Loader::LoadProgram( // DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - auto originProgramDesc = std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); - framework::Program program; + Program program; program.originProgram = originProgramDesc; program.quantification = quantification; program.combined_params_len = 0; program.combined_params_buf = nullptr; - auto scope = std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; // use originProgramDesc and scope to init tensors InitMemoryFromProgram(originProgramDesc, scope); // perform fusion and print infos - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); return program; } template -const framework::Program Loader::LoadCombinedMemory( 
+const Program Loader::LoadCombinedMemory( size_t read_size, const uint8_t *buf, size_t combined_params_len, - const uint8_t *combined_params_buf, bool optimize, bool quantification) { + uint8_t *combined_params_buf, bool optimize, bool quantification) { bool can_add_split = false; PaddleMobile__Framework__Proto__ProgramDesc *c_program; @@ -177,26 +212,31 @@ const framework::Program Loader::LoadCombinedMemory( DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - auto originProgramDesc = std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); - framework::Program program; + Program program; program.combined = true; program.originProgram = originProgramDesc; program.quantification = quantification; program.combined_params_len = combined_params_len; program.combined_params_buf = combined_params_buf; - auto scope = std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; InitMemoryFromProgram(originProgramDesc, scope); - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, nullptr); return program; } template class Loader; + template class Loader; + template class Loader; +template class Loader; + +} // namespace framework } // namespace paddle_mobile diff --git a/src/framework/loader.h b/src/framework/loader.h new file mode 100644 index 0000000000000000000000000000000000000000..3200f0b25368fa123b80c51000cfd6c6a6d084b6 --- /dev/null +++ b/src/framework/loader.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "common/types.h" +#include "framework/program/program.h" + +namespace paddle_mobile { +namespace framework { + +template +class Loader { + public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + const Program Load(const std::string &dirname, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + const Program Load(const std::string &model_path, + const std::string ¶_path, + bool optimize = false, + bool quantification = false); + + const Program LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, + size_t combined_params_len, + uint8_t *combined_params_buf, + bool optimize = false, + bool quantification = false); + + private: + const Program LoadProgram(const std::string &model_path, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + void InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h index 32954531d0854b3318185aacdf99314051f98f6a..219385ab1429fefddc9d380799259f7562e0030f 100644 --- a/src/framework/op_registry.h +++ b/src/framework/op_registry.h @@ -14,8 +14,10 @@ limitations under the License. 
*/ #pragma once +#include #include #include + #include "common/log.h" #include "common/type_define.h" #include "framework/op_info.h" @@ -120,5 +122,8 @@ class OpRegistry { #define REGISTER_OPERATOR_FPGA(op_type, op_class) \ REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); +#define REGISTER_OPERATOR_CL(op_type, op_class) \ + REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL); + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 21b14dfcac682e7d310dcf4e8c47afaa0fb68fb3..ac2a4917db6d90045623b71d07624eb6d07de082 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -56,37 +56,69 @@ template void OperatorBase::CheckAllInputOutputSet() const {} template -void OperatorBase::Run() const { +void OperatorBase::Run() { + DLOG << " ----- Begin run impl --- " << type_ << " ----- "; RunImpl(); -#ifdef PADDLE_MOBILE_DEBUG - DLOG << "-------------" << type_ << "----------------------------"; - vector input_keys = GetInputKeys(); - for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto vari = scope_->FindVar(var_vec_in[i]); - if (vari->IsInitialized()) { - Tensor *tensor = vari->template GetMutable(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; - } - } - } - for (const auto key : GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto vari = scope_->FindVar(var_vec_out[i]); - if (vari->IsInitialized()) { - Tensor *tensor = vari->template GetMutable(); - if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; - } - } - } -#endif + DLOG << " ----- End run impl --- " << type_ << " ----- "; + //#ifdef PADDLE_MOBILE_DEBUG + // DLOG << "-------------" << type_ << "----------------------------"; + // vector input_keys = GetInputKeys(); + // for (const auto key : input_keys) { + // auto var_vec_in = 
inputs_.at(key); + // for (int i = 0; i < var_vec_in.size(); ++i) { + // auto vari = scope_->FindVar(var_vec_in[i]); + // if (vari->IsInitialized()) { + //#ifdef PADDLE_MOBILE_CL + // if (type_ == "feed") { + // Tensor *tensor = vari->template + // GetMutable(); if (tensor) DLOG << type_ << " + // input- " << key << "=" << *tensor; + // } else { + // CLImage *cl_image = vari->template + // GetMutable(); if (cl_image) { + // DLOG << type_ << " input- " << key << "=" << *cl_image; + // } + // } + // + //#else + // Tensor *tensor = vari->template GetMutable(); + // if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + //#endif + // } + // } + // } + // for (const auto key : GetOutKeys()) { + // auto var_vec_out = outputs_.at(key); + // for (int i = 0; i < var_vec_out.size(); ++i) { + // auto vari = scope_->FindVar(var_vec_out[i]); + // if (vari->IsInitialized()) { + //#ifdef PADDLE_MOBILE_CL + // if (type_ == "fetch") { + // Tensor *tensor = vari->template + // GetMutable(); if (tensor) { + // DLOG << type_ << " output- " << key << "=" << *tensor; + // } + // } else { + // CLImage *cl_image = vari->template + // GetMutable(); if (cl_image) { + // DLOG << type_ << " output- " << key << "=" << *cl_image; + // } + // } + // + //#else + // Tensor *tensor = vari->template GetMutable(); + // if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + //#endif + // } + // } + // } + //#endif } template class OperatorBase; template class OperatorBase; template class OperatorBase; +template class OperatorBase; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.h b/src/framework/operator.h index 5252ee65a2a80910500f4085bb92b80829f9e45b..fa7417a2975e224d9cac9bfdd4e28d73a34e019e 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include "common/enforce.h" @@ -31,7 +32,10 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" - +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_helper.h" +#include "framework/cl/cl_scope.h" +#endif namespace paddle_mobile { namespace framework { using std::string; @@ -59,10 +63,10 @@ class OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope); virtual ~OperatorBase() {} - void Run() const; + void Run(); std::vector GetOutKeys() const; std::vector GetInputKeys() const; - virtual void RunImpl() const = 0; + virtual void RunImpl() = 0; virtual void Init() = 0; /* @@ -112,9 +116,13 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) : OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, *scope) { +#ifdef PADDLE_MOBILE_CL + kernel_.InitCLHelper(scope->GetCLScpoe()); +#endif + } - virtual void RunImpl() const { this->kernel_.Compute(this->param_); } + virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; @@ -123,6 +131,7 @@ class OperatorWithKernel : public OperatorBase { // DLOG << i.first; // DLOG << i.second; // } + PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed", this->type_.c_str()); } @@ -138,22 +147,35 @@ class OpKernelBase { template class OpKernelBase { public: - /* - * @b 所有kernel 需实现 Compute 方法 - * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, - * 所有结构体存在与: paddle-mobile/src/operators/op_param.h - * */ -#ifdef PADDLE_MOBILE_MALI_GPU + OpKernelBase() = default; + +#ifdef PADDLE_MOBILE_CL + virtual void InitCLHelper(CLScope *clScope) { + cl_helper_ = CLHelper(clScope); + } +#endif + + /* + * @b 所有kernel 需实现 Compute 方法 + * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, + * 所有结构体存在与: paddle-mobile/src/operators/op_param.h + * */ +#ifdef PADDLE_MOBILE_MALI_GPU OpKernelBase() {
acl_op_ = nullptr; } void *GetAclOp() const { return acl_op_; } void SetAclOp(void *op, void *ob) const { reinterpret_cast *>(ob)->acl_op_ = op; } #endif - virtual void Compute(const P ¶) const = 0; + virtual void Compute(const P ¶) = 0; virtual bool Init(P *para) { return true; } virtual ~OpKernelBase() = default; + protected: +#ifdef PADDLE_MOBILE_CL + CLHelper cl_helper_; +#endif + private: #ifdef PADDLE_MOBILE_MALI_GPU void *acl_op_; diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 696cf75b91ff88837cffd3304f5fe3cd491e77eb..6a25b1c40bd5c1b74ded54ee4134d71c77b15244 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "framework/program/program_desc.h" #include "framework/scope.h" +#include + namespace paddle_mobile { namespace framework { @@ -32,7 +34,7 @@ class Program { bool combined = false; bool quantification = false; size_t combined_params_len; - const uint8_t *combined_params_buf; + uint8_t *combined_params_buf; }; } // namespace framework diff --git a/src/framework/scope.h b/src/framework/scope.h index 054f141ff68895e0879fd31e15d90c76ea038135..abc727231a0d119ab53d765ab020085aaab9102d 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -15,8 +15,14 @@ limitations under the License. */ #pragma once #include +#include #include -#include "variable.h" +#include + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_scope.h" +#endif +#include "framework/variable.h" namespace paddle_mobile { namespace framework { @@ -33,6 +39,10 @@ class Scope { delete kid; } kids_.clear(); + +#ifdef PADDLE_MOBILE_CL + delete cl_scope_; +#endif } Scope &NewScope() const; @@ -72,6 +82,10 @@ class Scope { Variable *FindVarLocally(const std::string &name) const; +#ifdef PADDLE_MOBILE_CL + CLScope *GetCLScpoe() { return cl_scope_; } +#endif + private: // Call Scope::NewScope for a sub-scope. 
explicit Scope(Scope const *parent) : parent_(parent) {} @@ -79,6 +93,10 @@ class Scope { mutable std::unordered_map vars_; mutable std::list kids_; Scope const *parent_{nullptr}; + +#ifdef PADDLE_MOBILE_CL + CLScope *cl_scope_ = new CLScope(); +#endif }; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 496cde98e57561ca048f356fa397f5447b9050f5..99d642919b9c34378f7bb90f0b7aacd61aa75d0e 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -24,65 +24,24 @@ limitations under the License. */ #include #include "common/enforce.h" -#include "common/types.h" #include "framework/data_layout.h" -#include "framework/ddim.h" +#include "framework/tensor_base.h" #include "memory/t_malloc.h" namespace paddle_mobile { namespace framework { -template -struct SizeOfTypeFunctor; - -template -struct SizeOfTypeFunctor { - size_t operator()(std::type_index type) const { - if (typeid(T).hash_code() == type.hash_code()) { - return sizeof(T); - } else { - return 0UL; - } - } -}; - -template <> -struct SizeOfTypeFunctor<> { - size_t operator()(std::type_index type) const { return 0UL; } -}; - -template -struct SizeOfTypeFunctor { - size_t operator()(std::type_index type) const { - SizeOfTypeFunctor head; - size_t head_size = head(type); - if (head_size != 0) { - return head_size; - } - SizeOfTypeFunctor tail; - return tail(type); - } -}; - -static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor - functor; - size_t size = functor(type); - - PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); - return size; -} class LoDTensor; -class Tensor { +class Tensor : public TensorBase { public: - Tensor() : offset_(0) {} + Tensor() {} template - Tensor(std::vector input, DDim ddim) : offset_(0) { + Tensor(std::vector input, DDim ddim) { PADDLE_MOBILE_ENFORCE( input.size() == framework::product(ddim), "input vector'length should be equal to tensor's length"); + auto 
input_ptr = mutable_data(ddim); for (int i = 0; i < input.size(); ++i) { input_ptr[i] = input[i]; @@ -95,46 +54,6 @@ class Tensor { this->offset_ = inTensor.offset_; } - /*! Return a pointer to mutable memory block. */ - template - inline T *data() { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s", - this->holder_->type().name()); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - /*! Return a pointer to constant memory block. */ - template - inline const T *data() const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s ,requested:%s", - this->holder_->type().name(), typeid(T).name()); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); - } - - inline bool IsInitialized() const { return holder_ != nullptr; } - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T *mutable_data() { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(typeid(T))); - } - #ifdef PADDLE_MOBILE_DEBUG template inline void dump(std::string filename) const { @@ -151,6 +70,21 @@ class Tensor { } #endif + /*! Resize the dimensions of the memory block. */ + inline Tensor &Resize(const DDim &dims) { + dims_ = dims; + return *this; + } + + /*! The internal of two tensors share the same memory block. 
*/ + inline Tensor &ShareDataWith(const Tensor &src) { + src.check_memory_size(); + if (holder_.get() != src.holder_.get()) { + *this = src; + } + return *this; + } + inline void *mutable_data(std::type_index type) { if (holder_ != nullptr) { holder_->set_type(type); @@ -165,6 +99,16 @@ class Tensor { reinterpret_cast(holder_->ptr()) + offset_); } + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T *mutable_data() { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(typeid(T))); + } + /** * @brief Return a pointer to mutable memory block. * @@ -180,27 +124,6 @@ class Tensor { return mutable_data(); } - /*! Return the dimensions of the memory block. */ - inline const DDim &dims() const { return dims_; } - - /*! Return the numel of the memory block. */ - inline int64_t numel() const { return product(dims_); } - - /*! Resize the dimensions of the memory block. */ - inline Tensor &Resize(const DDim &dims) { - dims_ = dims; - return *this; - } - - /*! The internal of two tensors share the same memory block. */ - inline Tensor &ShareDataWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get()) { - *this = src; - } - return *this; - } - /** * @brief Return a sub-tensor of the given tensor. * @@ -234,44 +157,35 @@ class Tensor { } } - std::type_index type() const { + /*! Return a pointer to mutable memory block. */ + template + inline T *data() { + check_memory_size(); PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor not initialized yet when Tensor::type() is called.") - return holder_->type(); - } + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); - // memory size returns the holding memory size in byte. - size_t memory_size() const { - return holder_ == nullptr ? 
0UL : holder_->size() - offset_; + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); } - inline void check_memory_size() const { + /*! Return a pointer to constant memory block. */ + template + inline const T *data() const { + check_memory_size(); PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), - "Tensor's dims_ is out of bound. "); + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s ,requested:%s", + this->holder_->type().name(), typeid(T).name()); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); } private: - /** - * @note Placeholder hides type T, so it doesn't appear as a - * template - * parameter of Variable. - */ - struct Placeholder { - virtual ~Placeholder() = default; - - virtual void *ptr() const = 0; - - virtual size_t size() const = 0; - - virtual std::type_index type() const = 0; - - virtual void set_type(std::type_index type) = 0; - }; - struct PlaceholderImpl : public Placeholder { PlaceholderImpl(size_t size, std::type_index type) : ptr_(static_cast(memory::Alloc(size)), @@ -299,27 +213,6 @@ class Tensor { std::type_index type_; }; - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /** - * @brief points to elements dimensions. - * - * @note dims_ do not indicate the memory block size. - */ - - DDim dims_; - - /** - * @brief A PlaceHolder may be shared by more than one tensor. - * - * @note Some of them may be slices of the others. So the offset_ - * is introduced here to indicate the byte offset between - * PlaceHolder::ptr_ and where the tensor data really - * begins. 
- */ - size_t offset_; - #ifdef PADDLE_MOBILE_FPGA public: // NOLINT inline void reset_data_ptr(void *p) { diff --git a/src/framework/tensor_base.h b/src/framework/tensor_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e1539d2e681973b39eeca5b30e2ed35b535be8cb --- /dev/null +++ b/src/framework/tensor_base.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "common/enforce.h" +#include "common/types.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +template +struct SizeOfTypeFunctor; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor + functor; + size_t size = functor(type); + + PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + +class TensorBase { + public: + virtual 
inline TensorBase &Resize(const DDim &dims) = 0; + + inline bool IsInitialized() const { return holder_ != nullptr; } + + /*! Return the dimensions of the memory block. */ + inline const DDim &dims() const { return dims_; } + + /*! Return the numel of the memory block. */ + inline int64_t numel() const { return product(dims_); } + + std::type_index type() const { + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor not initialized yet when Tensor::type() is called.") + return holder_->type(); + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const { + return holder_ == nullptr ? 0UL : holder_->size() - offset_; + } + + inline void check_memory_size() const { + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), + "Tensor's dims_ is out of bound. "); + } + + protected: + /** + * @note Placeholder hides type T, so it doesn't appear as a + * template + * parameter of Variable. + */ + struct Placeholder { + virtual ~Placeholder() = default; + + virtual void *ptr() const = 0; + + virtual size_t size() const = 0; + + virtual std::type_index type() const = 0; + + virtual void set_type(std::type_index type) = 0; + }; + + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really + * begins. 
+ */ + size_t offset_ = 0; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 6a7dff597af7fa5de06c90304136e81390fe06af..67f255315fa71acbf24f5071735020c0a435ce64 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -126,6 +126,8 @@ CreatePaddlePredictor( x.reset(new PaddleMobilePredictor(config)); } else if (config.device == PaddleMobileConfig::kGPU_MALI) { x.reset(new PaddleMobilePredictor(config)); + } else if (config.device == PaddleMobileConfig::kGPU_CL) { + x.reset(new PaddleMobilePredictor(config)); } else { LOG(kLOG_ERROR) << "unsupport device type!"; return nullptr; diff --git a/src/ios_io/PaddleMobileCPU.h b/src/io/ios_io/PaddleMobileCPU.h similarity index 100% rename from src/ios_io/PaddleMobileCPU.h rename to src/io/ios_io/PaddleMobileCPU.h diff --git a/src/ios_io/PaddleMobileCPU.mm b/src/io/ios_io/PaddleMobileCPU.mm similarity index 100% rename from src/ios_io/PaddleMobileCPU.mm rename to src/io/ios_io/PaddleMobileCPU.mm diff --git a/src/jni/PML.java b/src/io/jni/PML.java similarity index 100% rename from src/jni/PML.java rename to src/io/jni/PML.java diff --git a/src/jni/paddle_mobile_jni.cpp b/src/io/jni/paddle_mobile_jni.cpp similarity index 100% rename from src/jni/paddle_mobile_jni.cpp rename to src/io/jni/paddle_mobile_jni.cpp diff --git a/src/jni/paddle_mobile_jni.h b/src/io/jni/paddle_mobile_jni.h similarity index 100% rename from src/jni/paddle_mobile_jni.h rename to src/io/jni/paddle_mobile_jni.h diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 16756a61bf3265a0b6d7c2ec731d2c3d17bf9c3c..d37895d3aaa108edb1a8956ccbcb91cbe4b97725 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -44,7 +44,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. 
- PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -121,7 +121,7 @@ struct PaddleModelMemoryPack { struct PaddleMobileConfig : public PaddlePredictor::Config { enum Precision { FP32 = 0 }; - enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; + enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 }; enum Precision precision; enum Device device; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index ec1fd1af45319192585f60fa1f90500fa2deaf46..3cd7c38b2b102659739aefc66b4b25f61cc48bcf 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -28,13 +28,13 @@ bool PaddleMobile::Load(const std::string &dirname, bool optimize, bool quantification, int batch_size, bool loddable) { if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->Load(dirname, optimize, quantification), batch_size, optimize, loddable); } else { @@ -50,13 +50,13 @@ bool PaddleMobile::Load(const std::string &model_path, bool quantification, int batch_size, bool loddable) { if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->Load(model_path, para_path, optimize, quantification), batch_size, optimize, loddable); } else { @@ -67,21 +67,22 @@ bool PaddleMobile::Load(const std::string &model_path, } template -bool PaddleMobile::LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf) { +bool PaddleMobile::LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, 
+ size_t combined_params_len, + uint8_t *combined_params_buf) { int batch_size = 1; bool optimise = true; bool quantification = false; if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, combined_params_buf, optimise, quantification), @@ -161,4 +162,6 @@ template class PaddleMobile; template class PaddleMobile; template class PaddleMobile; +template class PaddleMobile; + } // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index e0ff51d246b179e3f91e1c94f3b26c5ff9ba3d8f..0e86fa988fe8a07131d3ea19fe7c606c27d70c2c 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -22,10 +22,10 @@ limitations under the License. */ #endif // _OPENMP #include "common/types.h" +#include "framework/executor.h" #include "framework/load_ops.h" +#include "framework/loader.h" #include "framework/tensor.h" -#include "io/executor.h" -#include "io/loader.h" namespace paddle_mobile { @@ -52,7 +52,7 @@ class PaddleMobile { bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf); + uint8_t *combined_params_buf); void SetThreadNum(int num); void Clear(); @@ -69,8 +69,8 @@ class PaddleMobile { #endif private: - std::shared_ptr> loader_; - std::shared_ptr> executor_; + std::shared_ptr> loader_; + std::shared_ptr> executor_; }; } // namespace paddle_mobile diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp index f820908404ea637d9680c32d5c4b5568e191dd7e..89220dd2489c93a84bc8a141c06a151b8044a4e4 100644 --- a/src/operators/batchnorm_op.cpp +++ b/src/operators/batchnorm_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef BATCHNORM_OP -#include "batchnorm_op.h" +#include "operators/batchnorm_op.h" #include "framework/op_proto_maker.h" #include "framework/op_registry.h" @@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp); #ifdef PADDLE_MOBILE_FPGA #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp); +#endif + #endif diff --git a/src/operators/bilinear_interp_op.h b/src/operators/bilinear_interp_op.h index 1b17406c546d336fd42b0a818d16627c87aedb09..2bb61d129d5ba45900f1c67b8c202e958a004bb7 100644 --- a/src/operators/bilinear_interp_op.h +++ b/src/operators/bilinear_interp_op.h @@ -40,10 +40,6 @@ class BilinearOp : public framework::OperatorWithKernel< DeviceType, BilinearInterpParam, operators::BilinearInterpKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, BilinearInterpParam, - operators::BilinearInterpKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index c06ca8265dd495acb79e4e2ec6c497941b822b21..3a3048c6624996892333a71773c33ee2f6e18e0a 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -39,10 +39,6 @@ class BoxCoderOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::BoxCoderKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, BoxCoderParam, - operators::BoxCoderKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index eb257d47228ab854c00574a001f6454e239cfbbd..a01e066edd1082bc109ba7eb0f31a2ac42ab865a 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -34,10 +34,6 @@ class ConcatOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ConcatKernel>( type, inputs, outputs, attrs, scope) {} - - using 
framework::OperatorWithKernel< - DeviceType, ConcatParam, - operators::ConcatKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index c4601995219b32db75f22c7c2ed959e18af85f36..2c70f42f56530c2d21252d6b51c228e7c49ca8bf 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -62,4 +62,8 @@ REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(conv2d, ops::ConvOp); +#endif + #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index 23c022e584f9be6cb0b4c2c416ca96e61b3c131f..1b8bd70805ccff8946c1ab12a207618849fc9ca4 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -34,10 +34,6 @@ class ConvOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ConvKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::ConvKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/crf_op.h b/src/operators/crf_op.h index 9b7487ee958467dac451c3bcb743e6122842c7f1..dca481bb2dd08dc65fb94e41d0573277c9b143c7 100644 --- a/src/operators/crf_op.h +++ b/src/operators/crf_op.h @@ -37,10 +37,6 @@ class CrfOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::CrfKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, CrfParam, - operators::CrfKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 845c59a19e613bfcf299b445b778eff4d99c7295..102d65670d3e50acd15745e95b85d7b843994ed7 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -36,10 +36,6 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< 
DeviceType, ConvParam, operators::DepthwiseConvKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::DepthwiseConvKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h index 65f3587c2336b3e581a30328c41ad397b2848b34..ce8acd5966439808f7a03f18cf3d29a1b5c0487e 100644 --- a/src/operators/dropout_op.h +++ b/src/operators/dropout_op.h @@ -38,10 +38,6 @@ class DropoutOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::DropoutKernel>( type, inputs, outputs, attrs, scope) {} - - // using framework::OperatorWithKernel, - // operators::DropoutKernel>; void InferShape() const override; protected: diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 93e447d51f0e9ce2fdf75c60332ad52950d68c3d..c956ee70b6b23cdf763cb01dd7c2798f4d6e9351 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef ELEMENTWISEADD_OP -#include "elementwise_add_op.h" +#include "operators/elementwise_add_op.h" namespace paddle_mobile { namespace operators { @@ -36,4 +36,8 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp); +#endif + #endif diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index a1360eba5480a46395cedb445a4df4e4ca0ab279..a853b40ff7ccf323911f2ea1bf6e23d67d111db2 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -37,10 +37,6 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< DeviceType, ElementwiseAddParam, operators::ElementwiseAddKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, - operators::ElementwiseAddKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index 41f9e687bb4024d245a89df3dc785e1254b5a9a7..c3211b9fa9cc4b973788af4104c7ebe7bea2f54f 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -14,6 +14,19 @@ limitations under the License. 
*/ #include "operators/feed_op.h" +namespace paddle_mobile { +namespace operators { + +template +void FeedOp::InferShape() const { + auto out_dims = this->param_.Out()->dims(); + out_dims[0] = this->param_.BatchSize(); + this->param_.Out()->Resize(out_dims); +} + +} // namespace operators +} // namespace paddle_mobile + namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU @@ -25,3 +38,6 @@ REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(feed, ops::FeedOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(feed, ops::FeedOp); +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index c7e77fcca40a3c533e442d10604c8cd9bcc1e74b..57932474184fd5431e5b6ac5756ab28faa2b1b9e 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -16,68 +16,29 @@ limitations under the License. */ #include #include "framework/operator.h" +#include "operators/kernel/feed_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { +using std::string; + template -class FeedOp : public framework::OperatorBase { +class FeedOp + : public framework::OperatorWithKernel, + FeedKernel> { public: FeedOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, scope.get()) {} - - void InferShape() const { - auto out_dims = param_.Out()->dims(); - out_dims[0] = param_.BatchSize(); - param_.Out()->Resize(out_dims); - } - -#ifdef PADDLE_MOBILE_FPGA - - void Init() { - Tensor *output = param_.Out(); - fpga::format_fp16_ofm(output); - } - - void RunImpl() const { - auto input = (Tensor *)const_cast(param_.InputX()); // NOLINT - fpga::format_image(input); - auto input_ptr = input->data(); - Tensor *output = param_.Out(); - auto output_ptr = output->data(); - - fpga::BypassArgs args = 
{fpga::DATA_TYPE_FP32}; - - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = (void *)input_ptr; // NOLINT - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); - } -#else - void Init() {} - void RunImpl() const { - param_.Out()->ShareDataWith(*param_.InputX()); - param_.Out()->set_lod(param_.InputX()->lod()); - } -#endif + : framework::OperatorWithKernel, + FeedKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; protected: - FeedParam param_; }; } // namespace operators diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 6c5d1341db12db5e602bad08aaa33f26b2ac3396..50e53c30cfd06a8fae8c9e18dd4aa985a056a13e 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -13,6 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/fetch_op.h" +namespace paddle_mobile { +namespace operators { + +template +void FetchOp::InferShape() const { + auto x_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dims); +} + +} // namespace operators +} // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU @@ -24,3 +35,6 @@ REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fetch, ops::FetchOp); +#endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 9fbfc2f417b52162950612beb2979fe640cbdcc4..f92c66a05f121b3f6b78c244dd01d81393fa5c68 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "framework/operator.h" +#include "operators/kernel/fetch_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -23,25 +24,20 @@ namespace operators { using std::string; template -class FetchOp : public framework::OperatorBase { +class FetchOp + : public framework::OperatorWithKernel, + FetchKernel> { public: FetchOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + : framework::OperatorWithKernel, + FetchKernel>( + type, inputs, outputs, attrs, scope) {} - void Init() {} - - void InferShape() const { - auto x_dims = param_.InputX()->dims(); - param_.Out()->Resize(x_dims); - } + void InferShape() const override; protected: - FetchParam param_; }; } // namespace operators diff --git a/src/operators/fill_constant_op.cpp b/src/operators/fill_constant_op.cpp index 
6d7c4f44f1b769c47d6f741d139118158292a40f..0c13c57ceb53933c750f8c1adaa8b4e24ff948c8 100644 --- a/src/operators/fill_constant_op.cpp +++ b/src/operators/fill_constant_op.cpp @@ -20,9 +20,6 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp); -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp); #endif diff --git a/src/operators/fill_constant_op.h b/src/operators/fill_constant_op.h index 78eb162efc8ccd42b9fba363d49d1dbc4052f6b2..e24cecd363630a845f147e2e429b973dad24f63d 100644 --- a/src/operators/fill_constant_op.h +++ b/src/operators/fill_constant_op.h @@ -37,7 +37,7 @@ class FillConstantOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { + void RunImpl() { auto data_type = static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( param_.DataDtype()); diff --git a/src/operators/flatten_op.h b/src/operators/flatten_op.h index e935ae308cf5c28b9c435086b2b5e4d4407c319a..a7a91e60701cf559cb35238aa2966c02c869e844 100644 --- a/src/operators/flatten_op.h +++ b/src/operators/flatten_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "framework/operator.h" #include "operators/kernel/flatten_kernel.h" @@ -53,10 +54,6 @@ class FlattenOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::FlattenKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FlattenParam, - operators::FlattenKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h index 7893ff95a671447adbeebeeaf4096235e7a37964..4ec76b500812f95eb64e27564d0e63b2c1b2c2d3 100644 --- a/src/operators/fusion_conv_add_add_prelu_op.h +++ b/src/operators/fusion_conv_add_add_prelu_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" @@ -67,10 +68,6 @@ class FusionConvAddAddPReluOp DeviceType, FusionConvAddAddPReluParam, operators::ConvAddAddPReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddAddPReluParam, - operators::ConvAddAddPReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index e7d6ee59f2dadbdca0af72af1e786f0430c58d63..b9bc948fe0e77741a36f959e29eb2a4c82e82b72 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -58,5 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif - +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); +#endif #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h 
b/src/operators/fusion_conv_add_bn_relu_op.h index 07bb0146b3f481e09d0a944c4791237e7eea08e4..6ecc9bdc4a90530221c70651c52457874e3eaaa8 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.h +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -66,10 +66,6 @@ class FusionConvAddBNReluOp DeviceType, FusionConvAddBNReluParam, operators::ConvAddBNReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, - operators::ConvAddBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_op.cpp b/src/operators/fusion_conv_add_op.cpp index 485ba1be9baee2034dbd5c47f64372b701026e44..1b32ec39b65f8b16fd8967be3f45f4b31db5ca16 100644 --- a/src/operators/fusion_conv_add_op.cpp +++ b/src/operators/fusion_conv_add_op.cpp @@ -58,4 +58,8 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp); +#endif + #endif diff --git a/src/operators/fusion_conv_add_op.h b/src/operators/fusion_conv_add_op.h index 365e3afa97c2c2fd82c629302f8a5fddf8abb406..eef143ce8716ce856784bb01dd3d58a26746b4e8 100644 --- a/src/operators/fusion_conv_add_op.h +++ b/src/operators/fusion_conv_add_op.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_add_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -56,10 +56,6 @@ class FusionConvAddOp : public framework::OperatorWithKernel< FusionConvAddParam, operators::ConvAddKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddParam, - operators::ConvAddKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h index 0b0763e781daf3d882d0463205b07fdef53b90f5..fc1143099e16b8b7f7c44d7fe5a5694a278a1906 100644 --- a/src/operators/fusion_conv_add_prelu_op.h +++ b/src/operators/fusion_conv_add_prelu_op.h @@ -39,10 +39,7 @@ class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher { std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}} - - }, - + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; } @@ -63,9 +60,6 @@ class FusionConvAddPReluOp operators::ConvAddPReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionConvAddPReluParam, - operators::ConvAddPReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 486221f0f6b2e1b0d78d2632c8d735a6a6a101bb..bb4b6666a881de0989d43840806b9d5d720b3b66 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #ifdef PADDLE_MOBILE_FPGA 
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif - +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp); +#endif #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index cde465f266f9eb829794bd295b70dc789f013ee0..22ba67c617ecdb0f3be2f5757504b6ba530b092c 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -56,9 +56,6 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< operators::ConvAddReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_bn_add_relu_op.h b/src/operators/fusion_conv_bn_add_relu_op.h index b2f911363acc4f9d5b3c4407317107efadf3996d..303668a89bf7869e72a4b546c5d96be24b26c4ec 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.h +++ b/src/operators/fusion_conv_bn_add_relu_op.h @@ -17,11 +17,12 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_bn_add_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -71,10 +72,6 @@ class FusionConvBNAddReluOp DeviceType, FusionConvBNAddReluParam, operators::ConvBNAddReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvBNAddReluParam, - operators::ConvBNAddReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h index a6bbe72500ccfe2b43e21496c5abc18b9a562d47..9bc534fe333c76e8f533c904560b8228760c66e5 100644 --- a/src/operators/fusion_conv_bn_relu_op.h +++ b/src/operators/fusion_conv_bn_relu_op.h @@ -63,10 +63,6 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel< DeviceType, FusionConvBNReluParam, operators::ConvBNReluKernel>(type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvBNReluParam, - operators::ConvBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h index 44a1f845bc9b2dc0251fb729de9f9c00071fd492..d7a74d896e904971e21c28fab29771b34a049921 100644 --- a/src/operators/fusion_dwconv_bn_relu_op.h +++ b/src/operators/fusion_dwconv_bn_relu_op.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/dwconv_bn_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -65,9 +65,6 @@ class FusionDWConvBNReluOp operators::DWConvBNReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionDWConvBNReluParam, - operators::DWConvBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 722c5225bc035df2761154a08a521a09b34a1e82..26cb40aac8e47203f125417e1f6b5df75d7835b5 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -56,10 +56,6 @@ class FusionFcOp : public framework::OperatorWithKernel< operators::FusionFcKernel>( type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionFcParam, - operators::FusionFcKernel>::OperatorWithKernel; - void InferShape() const override; }; diff --git a/src/operators/fusion_fc_relu_op.h b/src/operators/fusion_fc_relu_op.h index 5cd884f04e819ac881c3b2a4ad666591ea610117..7324f94138e59c4a4a93fe2658b38ddbdf6fa651 100644 --- a/src/operators/fusion_fc_relu_op.h +++ b/src/operators/fusion_fc_relu_op.h @@ -56,9 +56,6 @@ class FusionFcReluOp : public framework::OperatorWithKernel< operators::FusionFcReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionFcReluParam, - operators::FusionFcReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/gru_op.h b/src/operators/gru_op.h index a45d3efe5b4c59f8582c534f85de7cc1ac82df85..5e66b497af15c498e2af5ff5903ef88a16db1832 100644 --- a/src/operators/gru_op.h +++ b/src/operators/gru_op.h @@ -37,10 +37,6 @@ class GruOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, 
operators::GruKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, GruParam, - operators::GruKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h index 50d5664c1a3ce999a0c163225d20126961804a22..036b496ca8293432aa30ae86542e78880143f086 100644 --- a/src/operators/im2sequence_op.h +++ b/src/operators/im2sequence_op.h @@ -16,15 +16,14 @@ limitations under the License. */ #pragma once -#include +#include #include "framework/operator.h" #include "operators/kernel/im2sequence_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -using namespace framework; - template class Im2SequenceOp : public framework::OperatorWithKernel< DeviceType, Im2SequenceParam, @@ -39,9 +38,6 @@ class Im2SequenceOp : public framework::OperatorWithKernel< operators::Im2SequenceKernel>(type, inputs, outputs, attrs, scope) {} - // using framework::OperatorWithKernel< - // DeviceType, Im2SequenceParam, - // operators::Im2SequenceKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index c420727f425092240994ee834117225c72abeec2..f31c4426db7d28234692742fcd670cb26ec50ab0 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -26,8 +26,7 @@ bool BatchNormKernel::Init(BatchNormParam *param) { } template <> -void BatchNormKernel::Compute( - const BatchNormParam ¶m) const { +void BatchNormKernel::Compute(const BatchNormParam ¶m) { BatchnormCompute(param); } diff --git a/src/operators/kernel/arm/bilinear_interp_kernel.cpp b/src/operators/kernel/arm/bilinear_interp_kernel.cpp index 4888f7a37a47fe80ffcbaee7e3f80b1d5c1f20f4..85192e28edf8351bd8be540b27aa986b2c458d0d 100644 --- a/src/operators/kernel/arm/bilinear_interp_kernel.cpp +++ 
b/src/operators/kernel/arm/bilinear_interp_kernel.cpp @@ -27,7 +27,7 @@ bool BilinearInterpKernel::Init(BilinearInterpParam *param) { template <> void BilinearInterpKernel::Compute( - const BilinearInterpParam ¶m) const { + const BilinearInterpParam ¶m) { BilinearInterpCompute(param); } diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp index b769d4fbbaa7570ee741476f960d9e5b60c61917..30ede12dffe0eed7673c9ae1f7c836fd1b5b7096 100644 --- a/src/operators/kernel/arm/box_coder_kernel.cpp +++ b/src/operators/kernel/arm/box_coder_kernel.cpp @@ -26,8 +26,7 @@ bool BoxCoderKernel::Init(BoxCoderParam *param) { } template <> -void BoxCoderKernel::Compute( - const BoxCoderParam ¶m) const { +void BoxCoderKernel::Compute(const BoxCoderParam ¶m) { BoxCoderCompute(param); } diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index 04c590e6b432fbf88cd136eac942485adf9a9003..8cdf6cb01afa85239bfd0d48bbce02790ba5250d 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -26,7 +26,7 @@ bool ConcatKernel::Init(ConcatParam *param) { } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) { ConcatCompute(param); param.Out()->set_lod(param.Inputs()[0]->lod()); } diff --git a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp index 74b88f5d4f5e24b1401803c8c48d99319f412d1b..2f6f5f3ac719b3fd32aac54ce36eb534f7d99dd7 100644 --- a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp @@ -28,7 +28,7 @@ bool ConvAddAddPReluKernel::Init( template <> void ConvAddAddPReluKernel::Compute( - const FusionConvAddAddPReluParam ¶m) const { + const FusionConvAddAddPReluParam ¶m) { ConvAddAddPReluCompute(param); } template class ConvAddAddPReluKernel; diff --git 
a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp index ca53ebea8e4577fdc52fad066691d4351eaf12f9..eb55920621db34d191a9536f287ec50747e1ce3c 100644 --- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp @@ -55,7 +55,7 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) { ConvAddBNReluCompute(param); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp index 1af1c3db1159cd4fed007ebf153ba15b804eee75..e016b8efbd15472ae0d77423d84dc19671bfa316 100644 --- a/src/operators/kernel/arm/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_kernel.cpp @@ -25,8 +25,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { } template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) const { +void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { ConvAddCompute(param); } diff --git a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp index 5930cfdcfc0f983c9f07754113dc37d5122d19f0..f04a9a7d746f2d970196945707bd05409c5fa340 100644 --- a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp @@ -27,7 +27,7 @@ bool ConvAddPReluKernel::Init(FusionConvAddPReluParam *param) { template <> void ConvAddPReluKernel::Compute( - const FusionConvAddPReluParam ¶m) const { + const FusionConvAddPReluParam ¶m) { ConvAddPReluCompute(param); } template class ConvAddPReluKernel; diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp index f50e1e3900bb5fce35a29100d6c2cb6004b4af74..211d6d8487bfd4afc71d74e5ecbff149ad34e466 100644 --- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp +++ 
b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -27,7 +27,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) { ConvAddReluCompute(param); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp index 785b13dde2ec1196792d17b253bb0d904da799f5..a0f21dd6126ed81cf5e96f99bd0f8ed5211f96a4 100644 --- a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp @@ -55,7 +55,7 @@ bool ConvBNAddReluKernel::Init( template <> void ConvBNAddReluKernel::Compute( - const FusionConvBNAddReluParam ¶m) const { + const FusionConvBNAddReluParam ¶m) { ConvBNAddReluCompute(param); } template class ConvBNAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp index 6b9ea0428fa496980a234c7c895ef9cbf1245b51..d8acb8d2083b732da026a9bff19c2d7732568597 100644 --- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp @@ -57,7 +57,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { template <> void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) const { + const FusionConvBNReluParam ¶m) { ConvBNReluCompute(param); } template class ConvBNReluKernel; diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 4e9d3a34f231485685bc8f7b087382cb99a3b036..0d67bdc656f2ba9ad674c18c9cefbd7d9cd711df 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -26,7 +26,7 @@ bool ConvKernel::Init(ConvParam *param) { } template <> -void ConvKernel::Compute(const ConvParam ¶m) const { +void ConvKernel::Compute(const ConvParam ¶m) { ConvCompute(param); } diff --git 
a/src/operators/kernel/arm/conv_transpose_kernel.cpp b/src/operators/kernel/arm/conv_transpose_kernel.cpp index 94f8a79101ca4b1f4085a4d172fee761714dc3d2..771a846ed65e5c69090698ce813103077dedaccf 100644 --- a/src/operators/kernel/arm/conv_transpose_kernel.cpp +++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp @@ -27,7 +27,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { template <> void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) const { + const ConvTransposeParam ¶m) { ConvTransposeCompute(param); } diff --git a/src/operators/kernel/arm/crf_kernel.cpp b/src/operators/kernel/arm/crf_kernel.cpp index 89769c50a6fc05b28192ebf584ba3cb12f19ac2c..d30c28b3576e2a8a8a108ae6c86edc2f4310b83f 100644 --- a/src/operators/kernel/arm/crf_kernel.cpp +++ b/src/operators/kernel/arm/crf_kernel.cpp @@ -27,7 +27,7 @@ bool CrfKernel::Init(CrfParam *param) { } template <> -void CrfKernel::Compute(const CrfParam ¶m) const { +void CrfKernel::Compute(const CrfParam ¶m) { CrfCompute(param); } diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index fd5e068afb6f7f2a069a7d8fccc459d4c2a6828d..000d59baa8c804201cbd2e2a731c2077196b698f 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -26,8 +26,7 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { } template <> -void DepthwiseConvKernel::Compute( - const ConvParam ¶m) const { +void DepthwiseConvKernel::Compute(const ConvParam ¶m) { DepthwiseConvCompute(param); } diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp index cd6c8d17f1ea05e3df6f8f364c2d3d5c9976e46b..ea893730c1148158f574fb6c467265b334ba2f45 100644 --- a/src/operators/kernel/arm/dequantize_kernel.cpp +++ b/src/operators/kernel/arm/dequantize_kernel.cpp @@ -29,8 +29,7 @@ bool DequantizeKernel::Init(DequantizeParam *param) { } template <> -void 
DequantizeKernel::Compute( - const DequantizeParam ¶m) const { +void DequantizeKernel::Compute(const DequantizeParam ¶m) { const Tensor *input = param.input_; Tensor *output = param.out_; float activation_scale = param.activation_scale_->data()[0]; diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index 4578ac6607d87c316853f6201f02f8204bc41de1..964773ad696ea53fccec62a394f00fa70daf7145 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -27,7 +27,7 @@ bool DropoutKernel::Init(DropoutParam *para) { template struct DropoutFunctor { - DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} + explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} inline T operator()(T in) const { return (1 - dropout_pro_) * in; } private: @@ -35,7 +35,7 @@ struct DropoutFunctor { }; template <> -void DropoutKernel::Compute(const DropoutParam ¶m) const { +void DropoutKernel::Compute(const DropoutParam ¶m) { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp index b85701bb936b2ccc0323e4d534424abb726a69be..f92d9a273467bf15d9d7fad43237af5385d3d54e 100644 --- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp @@ -54,7 +54,7 @@ bool DWConvBNReluKernel::Init(FusionDWConvBNReluParam *param) { template <> void DWConvBNReluKernel::Compute( - const FusionDWConvBNReluParam ¶m) const { + const FusionDWConvBNReluParam ¶m) { DWConvBNReluCompute(param); } template class DWConvBNReluKernel; diff --git a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp index 9c6f4a3316385b803a8fdb833490f1fe9e7f41ac..043d27e72f16ab4b38f31d6cff60bd2f4e89a649 100644 --- a/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ 
b/src/operators/kernel/arm/elementwise_add_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) { ElementwiseAddCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/elementwise_mul_kernel.cpp b/src/operators/kernel/arm/elementwise_mul_kernel.cpp index 00205952a2567aae5927e318c494c90bc4a5ffbb..9c245707da31d07e2419439c68343f7014beb416 100644 --- a/src/operators/kernel/arm/elementwise_mul_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_mul_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { template <> void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) const { + const ElementwiseMulParam ¶m) { ElementwiseMulCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/elementwise_sub_kernel.cpp b/src/operators/kernel/arm/elementwise_sub_kernel.cpp index d78b3e31098ef7ef929a0d2c00043fab7193b01c..30f607155c4a91f4f523c6596f09c2379970108c 100644 --- a/src/operators/kernel/arm/elementwise_sub_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_sub_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseSubKernel::Init(ElementwiseSubParam *param) { template <> void ElementwiseSubKernel::Compute( - const ElementwiseSubParam ¶m) const { + const ElementwiseSubParam ¶m) { ElementwiseSubCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/feed_kernel.cpp b/src/operators/kernel/arm/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..598f6df01b16683f4d6e06f6418a2930a7ec8736 --- /dev/null +++ b/src/operators/kernel/arm/feed_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/fetch_kernel.cpp b/src/operators/kernel/arm/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..62d0e678891e4f54471f95de08242a3e72f7a385 --- /dev/null +++ b/src/operators/kernel/arm/fetch_kernel.cpp @@ -0,0 +1,26 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_CONVADD_OP +#include "operators/kernel/fetch_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} +template class FetchKernel; +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/arm/flatten_kernel.cpp b/src/operators/kernel/arm/flatten_kernel.cpp index ef4fe913c4800526f46daa75760afe82fdbee591..4d00e494544557ce05f2af16bb59979ea2b8927f 100644 --- a/src/operators/kernel/arm/flatten_kernel.cpp +++ b/src/operators/kernel/arm/flatten_kernel.cpp @@ -26,7 +26,7 @@ bool FlattenKernel::Init(FlattenParam *param) { } template <> -void FlattenKernel::Compute(const FlattenParam ¶m) const { +void FlattenKernel::Compute(const FlattenParam ¶m) { FlattenCompute(param); } diff --git a/src/operators/kernel/arm/fusion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp index d9d112e7a762705efe041c74eea9ddb7d5162918..c503edab643def7af0585a18d774b14ca0a3c39d 100644 --- a/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -26,8 +26,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { } template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { +void FusionFcKernel::Compute(const FusionFcParam ¶m) { FusionFcCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/gru_kernel.cpp b/src/operators/kernel/arm/gru_kernel.cpp index 168471185e07a9c1814c708238996a82c1ee0891..a4e89ff42a3d70c0a9a3d1bd7316e18d015a0926 100644 --- a/src/operators/kernel/arm/gru_kernel.cpp +++ b/src/operators/kernel/arm/gru_kernel.cpp @@ -26,7 +26,7 @@ bool GruKernel::Init(GruParam *param) { } template <> -void GruKernel::Compute(const GruParam ¶m) const { +void GruKernel::Compute(const GruParam ¶m) { GruCompute(param); 
param.OutHidden()->set_lod(param.InputInput()->lod()); // DLOG << "________________" << param.OutHidden()->dims(); diff --git a/src/operators/kernel/arm/im2sequence_kernel.cpp b/src/operators/kernel/arm/im2sequence_kernel.cpp index cc6ae2ae8bc7cde9b365817ba9cafc19776da913..07ce0314fa08467d4fc63bc0745a49b8a3b2f263 100644 --- a/src/operators/kernel/arm/im2sequence_kernel.cpp +++ b/src/operators/kernel/arm/im2sequence_kernel.cpp @@ -33,7 +33,7 @@ inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, template <> void Im2SequenceKernel::Compute( - const Im2SequenceParam ¶m) const { + const Im2SequenceParam ¶m) { const Tensor *in_x = param.Input(); framework::LoDTensor *out = param.Output(); out->mutable_data(); @@ -56,7 +56,7 @@ void Im2SequenceKernel::Compute( out->mutable_data({batch_size * output_height * output_width, img_channels * kernels[0] * kernels[1]}); const std::vector dilations({1, 1}); - // TODO: verify + // TODO(): verify auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { diff --git a/src/operators/kernel/arm/lookup_kernel.cpp b/src/operators/kernel/arm/lookup_kernel.cpp index 584c497c701bd0598e0a151774fe60b7c7fee718..0e6df6ab6bf19f67b0c5f5a873d4a47215167e45 100644 --- a/src/operators/kernel/arm/lookup_kernel.cpp +++ b/src/operators/kernel/arm/lookup_kernel.cpp @@ -25,7 +25,7 @@ bool LookupKernel::Init(LookupParam *param) { } template <> -void LookupKernel::Compute(const LookupParam ¶m) const { +void LookupKernel::Compute(const LookupParam ¶m) { LookupCompute(param); param.Out()->set_lod(param.InputIds()->lod()); } diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 3ec1bdd9a0e2ebbce555eef944fe56750505430f..bf049053fc5b9157f24c50233742eea3c0ca2de1 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -26,7 +26,7 @@ bool LrnKernel::Init(LrnParam *param) { } template <> 
-void LrnKernel::Compute(const LrnParam ¶m) const { +void LrnKernel::Compute(const LrnParam ¶m) { LrnCompute(param); } diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index 276281f963e449af9d55f7c5ca58ef5da17e6f93..59d16600d71d247c42bb7625a3dddd5952a33705 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -26,7 +26,7 @@ bool MulKernel::Init(MulParam *param) { } template <> -void MulKernel::Compute(const MulParam ¶m) const { +void MulKernel::Compute(const MulParam ¶m) { MulCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 938f81cf485eb64f408c0fb274eeec673349e306..61638da0051c7b27b695752c445f0fd6b20114b5 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -27,7 +27,7 @@ bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) const { + const MultiClassNMSParam ¶m) { MultiClassNMSCompute(param); } diff --git a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp index e72c29135e9898d3b5342d1c4b4f0176f105a62a..1ae11aba41f1b2dbd9207e0808990a262bb80f56 100644 --- a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp +++ b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp @@ -28,7 +28,7 @@ bool PolygonBoxTransformKernel::Init( template <> void PolygonBoxTransformKernel::Compute( - const PolygonBoxTransformParam ¶m) const { + const PolygonBoxTransformParam ¶m) { PolygonBoxTransformCompute(param); } diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 60d6f1401876b957649d08889218b88cf1fe5eef..58d6359efa48b0db215269a631e7e4cb57c429d9 100644 --- 
a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -25,7 +25,7 @@ bool PoolKernel::Init(PoolParam *param) { } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { +void PoolKernel::Compute(const PoolParam ¶m) { PoolCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/prelu_kernel.cpp b/src/operators/kernel/arm/prelu_kernel.cpp index e1ec927fb13d1f4a2e600d46f65f2806448059d9..591bd644165f1a271a879073b27429d1780cbfb5 100644 --- a/src/operators/kernel/arm/prelu_kernel.cpp +++ b/src/operators/kernel/arm/prelu_kernel.cpp @@ -35,7 +35,7 @@ struct PReluFunctor { * @b 特化到具体平台的实现, param 从 op 层传入 * */ template <> -void PReluKernel::Compute(const PReluParam ¶m) const { +void PReluKernel::Compute(const PReluParam ¶m) { auto *x = param.InputX(); auto *alpha = param.InputAlpha(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp index 71011fa2112f36d573b5bdc55f1b5bf92318c448..c067d3388dd928b032178add99c6567a8add20d3 100644 --- a/src/operators/kernel/arm/prior_box_kernel.cpp +++ b/src/operators/kernel/arm/prior_box_kernel.cpp @@ -26,8 +26,7 @@ bool PriorBoxKernel::Init(PriorBoxParam *param) { } template <> -void PriorBoxKernel::Compute( - const PriorBoxParam ¶m) const { +void PriorBoxKernel::Compute(const PriorBoxParam ¶m) { PriorBoxCompute(param); } diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index 11a1f0a53d4886e1a07d258b76b3827671471dca..17f442abe4e03d936eb3b317d5b6f164ac0924e7 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -279,8 +279,7 @@ bool QuantizeKernel::Init(QuantizeParam *param) { } template <> -void QuantizeKernel::Compute( - const QuantizeParam ¶m) const { +void QuantizeKernel::Compute(const QuantizeParam ¶m) { float max_abs = 0.f; const Tensor *input = param.input_; Tensor *output 
= param.out_; diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 6e04e6013aa8dd5c50dcc22a720b470b08ecd648..8ee103484eb753913e5554b64d6dac523066322a 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -26,7 +26,7 @@ bool ReluKernel::Init(ReluParam *param) { } template <> -void ReluKernel::Compute(const ReluParam ¶m) const { +void ReluKernel::Compute(const ReluParam ¶m) { ReluCompute(param); } diff --git a/src/operators/kernel/arm/reshape2_kernel.cpp b/src/operators/kernel/arm/reshape2_kernel.cpp index 83bbf112abb8b5e290126d6909a0fe77291f8fac..093105f906da2287015417ec05b709aebd4a1fb2 100644 --- a/src/operators/kernel/arm/reshape2_kernel.cpp +++ b/src/operators/kernel/arm/reshape2_kernel.cpp @@ -26,8 +26,7 @@ bool Reshape2Kernel::Init(Reshape2Param *param) { } template <> -void Reshape2Kernel::Compute( - const Reshape2Param ¶m) const { +void Reshape2Kernel::Compute(const Reshape2Param ¶m) { Reshape2Compute(param); } diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 235288ae13e2c557e6f7310727f5d8e6e83cedf6..800808f9c23cd07d17f8207b9b51e96d3feb34f3 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -26,7 +26,7 @@ bool ReshapeKernel::Init(ReshapeParam *param) { } template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) const { +void ReshapeKernel::Compute(const ReshapeParam ¶m) { ReshapeCompute(param); } diff --git a/src/operators/kernel/arm/resize_kernel.cpp b/src/operators/kernel/arm/resize_kernel.cpp index 5c0c186554a31454447b1df47a1b7573fd948fb9..b53b7545e33c929fe0b55bccd68e7b955db0d676 100644 --- a/src/operators/kernel/arm/resize_kernel.cpp +++ b/src/operators/kernel/arm/resize_kernel.cpp @@ -108,7 +108,7 @@ void ResizeTensor(const Tensor* src, Tensor* dst) { } template <> -void ResizeKernel::Compute(const ResizeParam& param) const { +void 
ResizeKernel::Compute(const ResizeParam& param) { const auto* input_x = param.InputX(); const auto& input_x_dims = input_x->dims(); auto* out = param.Out(); diff --git a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp index 299132ea00f40838249022c45d994e7d88547eaa..bded56275f80741c552d4978bb238d6f0d6339db 100644 --- a/src/operators/kernel/arm/scale_kernel.cpp +++ b/src/operators/kernel/arm/scale_kernel.cpp @@ -23,7 +23,7 @@ namespace operators { * @b 特化到具体平台的实现, param 从 op 层传入 * */ template <> -void ScaleKernel::Compute(const ScaleParam ¶m) const { +void ScaleKernel::Compute(const ScaleParam ¶m) { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/shape_kernel.cpp b/src/operators/kernel/arm/shape_kernel.cpp index 1687cfb4cdaf12eb2be9d465a83b82034b59f7cc..4adbf8fa1321c57330b480068ff1f7df7454d7e6 100644 --- a/src/operators/kernel/arm/shape_kernel.cpp +++ b/src/operators/kernel/arm/shape_kernel.cpp @@ -26,7 +26,7 @@ bool ShapeKernel::Init(ShapeParam *param) { } template <> -void ShapeKernel::Compute(const ShapeParam ¶m) const { +void ShapeKernel::Compute(const ShapeParam ¶m) { ShapeCompute(param); } diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp index 7912fd8762b693cd40c632d6b152406ed4b0c568..3d6e14ffea80169172431229e34309cde331d588 100644 --- a/src/operators/kernel/arm/sigmoid_kernel.cpp +++ b/src/operators/kernel/arm/sigmoid_kernel.cpp @@ -32,7 +32,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { } template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) const { +void SigmoidKernel::Compute(const SigmoidParam ¶m) { SigmoidCompute(param); } diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index f86a10601aa3a67300736f2f4c751c05bf41a781..d5a1009fd79d57d8815d313ed61bbc5d7bf32bbe 100644 --- 
a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -26,7 +26,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { } template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { SoftmaxCompute(param); } diff --git a/src/operators/kernel/arm/split_kernel.cpp b/src/operators/kernel/arm/split_kernel.cpp index d2ca34f764adc50154fb58e3a6248f9311bbface..13c7567e3db137f0c579ad0e33b1856aaf8334f2 100644 --- a/src/operators/kernel/arm/split_kernel.cpp +++ b/src/operators/kernel/arm/split_kernel.cpp @@ -26,7 +26,7 @@ bool SplitKernel::Init(SplitParam *param) { } template <> -void SplitKernel::Compute(const SplitParam ¶m) const { +void SplitKernel::Compute(const SplitParam ¶m) { SplitCompute(param); } diff --git a/src/operators/kernel/arm/sum_kernel.cpp b/src/operators/kernel/arm/sum_kernel.cpp index 0290037522a2bf3b3c88ce129eda277a401fecb5..2b36a382a1681b08e5f6c87b9031492e81a579cd 100644 --- a/src/operators/kernel/arm/sum_kernel.cpp +++ b/src/operators/kernel/arm/sum_kernel.cpp @@ -26,7 +26,7 @@ bool SumKernel::Init(SumParam *param) { } template <> -void SumKernel::Compute(const SumParam ¶m) const { +void SumKernel::Compute(const SumParam ¶m) { SumCompute(param); param.Out()->set_lod(param.Inputs()[0]->lod()); } diff --git a/src/operators/kernel/arm/transpose2_kernel.cpp b/src/operators/kernel/arm/transpose2_kernel.cpp index 656d2768840a52f50c42d3797018aa9aec037783..228f210ea1c52f1bfe601bd46f741347dabd6cce 100644 --- a/src/operators/kernel/arm/transpose2_kernel.cpp +++ b/src/operators/kernel/arm/transpose2_kernel.cpp @@ -25,8 +25,7 @@ bool Transpose2Kernel::Init(Transpose2Param *param) { } template <> -void Transpose2Kernel::Compute( - const Transpose2Param ¶m) const { +void Transpose2Kernel::Compute(const Transpose2Param ¶m) { Transpose2Compute(param); } diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index 
bb7a881bdc1d2706a25a77833ca38695ede2fec7..f90376eb507253badb209838a3db4bafbcfbb5b9 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -25,8 +25,7 @@ bool TransposeKernel::Init(TransposeParam *param) { } template <> -void TransposeKernel::Compute( - const TransposeParam ¶m) const { +void TransposeKernel::Compute(const TransposeParam ¶m) { TransposeCompute(param); } diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index beac7399583d074956fa4564fdd9312b2d7985f0..1f2db456d360d6eb6c684fb98e3807b07cc89b92 100644 --- a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -22,13 +22,11 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -using namespace framework; - template class BatchNormKernel : public framework::OpKernelBase> { public: - void Compute(const BatchNormParam ¶m) const; + void Compute(const BatchNormParam ¶m); bool Init(BatchNormParam *param); }; diff --git a/src/operators/kernel/bilinear_interp_kernel.h b/src/operators/kernel/bilinear_interp_kernel.h index ac3dfcb16190315f72dc60da54c4f944874e4458..9a68fe65a562a8567dab2e5977506e083f7889a2 100644 --- a/src/operators/kernel/bilinear_interp_kernel.h +++ b/src/operators/kernel/bilinear_interp_kernel.h @@ -29,7 +29,7 @@ class BilinearInterpKernel : public framework::OpKernelBase> { public: - void Compute(const BilinearInterpParam& param) const; + void Compute(const BilinearInterpParam& param); bool Init(BilinearInterpParam* param); }; } // namespace operators diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 58144a87349ed3a6504e0074903594be3aa6fe8f..eadb21b3d5ecb95ef82cfef2ac8c3245e925ec7c 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -29,7 +29,7 @@ template class BoxCoderKernel : public framework::OpKernelBase> { public: - void 
Compute(const BoxCoderParam& param) const; + void Compute(const BoxCoderParam& param); bool Init(BoxCoderParam* param); }; } // namespace operators diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d5695cb80736dcc126ce5f726c0a2566884fe45 --- /dev/null +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef BATCHNORM_OP + +#include "operators/kernel/batchnorm_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool BatchNormKernel::Init(BatchNormParam *param) { + this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl"); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + framework::CLImage *new_bias = new framework::CLImage(); + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + return true; +} + +template <> +void BatchNormKernel::Compute( + const BatchNormParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY()); + + auto input = param.InputX()->GetCLImage(); + auto out = param.OutputY()->GetCLImage(); + auto new_scale = 
param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + const int out_width = default_work_size[1]; + + clSetKernelArg(kernel, 1, sizeof(int), &out_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &out); + + // cl_event out_event = param.OutputY()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class BatchNormKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..9d0857a45e0766482e2dbb6ded77edb07517bc0f --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void batchnorm(__private const int out_width, + __read_only image2d_t input, + __read_only image2d_t new_scale_image, + __read_only image2d_t new_bias_image, + __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + half4 new_scale = read_imageh(new_scale_image, sampler, (int2)(out_c, 0)); + half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0)); + + int pos_x = mad24(out_c, out_width, out_w); + half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); + half4 out = mad(in, new_scale, new_bias); + + write_imageh(output, (int2)(pos_x, out_nh), out); +} diff --git a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1f2e36687ab04be2b8c18b26e868b7709bc3c231 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x/w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in + biase; + write_imageh(outputImage,coords,output); + } diff --git a/src/operators/kernel/cl/cl_kernel/cl_common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h new file mode 100644 index 0000000000000000000000000000000000000000..34f36eb9a3ffbdc5781c974926ea4a7d5258636b --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/cl_common.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +inline half4 activation(half4 in +#ifdef PRELU + , + half4 prelu_alpha +#endif +) { + half4 output; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (half4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (half4)(0.0f)); +#endif + return output; +} diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..aa3eaedda5634294f231831d550296dfdba0dd48 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8bf7e7d7d9fbb9eb9e930e9c1c3a58bb3391efc --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..8d686c20dfaa31204a4c44105fb479423352fb9e --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define RELU +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..2a5c823295c7562361433414cf35be81d2fbf00c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl new file mode 100644 index 0000000000000000000000000000000000000000..db3c8d3ca74dd25a827fcb594728ce81bfc1078a --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -0,0 +1,527 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +/* +conv +conv_bn +conv_add +conv_relu +conv_bn_relu +conv_add_relu +conv_add_bn_relu +*/ + +#include "cl_common.h" + +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input[9]; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || 
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, 
pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + + + + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int 
stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x ; + int filter_y = pos_in_filter_block.y ; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || 
in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= 
input_height) << 15)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + half4 filters[9]; + filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); + filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + for(int i = 0 ;i < 9 ; i++){ + output += inputs[i] * filters[i]; + } +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, 
output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + write_imageh(output_image, output_pos, output); + +} + + +__kernel void conv_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + 
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); +/* + output.x = dot(input, weight0); + output.y = dot(input, weight1); + output.z = dot(input, weight2); + output.w = dot(input, weight3); +*/ + + output = mad(input.x, weight0, output); + output = mad(input.y, weight1, output); + output = mad(input.z, weight2, output); + output = mad(input.w, weight3, output); + + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); +} + + + +/* + +__kernel void conv_1x1_4(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width, + __private const int input_height, + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0) * 4; + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = 
read_imageh(bias, sampler, (int2)(out_c + 1, 0)); + half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0)); + half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0)); +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input.x, weight0_0, output0); + output0 = mad(input.y, weight0_1, output0); + output0 = mad(input.z, weight0_2, output0); + output0 = mad(input.w, weight0_3, output0); + + half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0)); + half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1)); + half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2)); + half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3)); + + output1 = mad(input.x, weight1_0, output1); + output1 = mad(input.y, weight1_1, output1); + output1 = mad(input.z, weight1_2, output1); + output1 = mad(input.w, weight1_3, output1); + + half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0)); + half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1)); + half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2)); + half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3)); + + output2 = mad(input.x, weight2_0, output2); + output2 = mad(input.y, weight2_1, output2); + output2 = mad(input.z, weight2_2, output2); + output2 = mad(input.w, weight2_3, output2); + + 
half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0)); + half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1)); + half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2)); + half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3)); + + output3 = mad(input.x, weight3_0, output3); + output3 = mad(input.y, weight3_1, output3); + output3 = mad(input.z, weight3_2, output3); + output3 = mad(input.w, weight3_3, output3); + + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0)); + +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); +#endif + + int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos0, output0); + + + int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos1, output1); + + + int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos2, output2); + + + int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos3, output3); +} + +*/ diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl new file mode 100644 index 
0000000000000000000000000000000000000000..3c3497f917d8a16c7c7e304edf00a4250066dce7 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..2a5c823295c7562361433414cf35be81d2fbf00c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..f304764868959ce028a8448c4d311db878cc1f6e --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords); + half4 output = in + biase; + write_imageh(outputImage,coords,output); + } diff --git a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..80d741d859af633299120bfec9f4cfeeaeb47194 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w) + { + int i = get_global_id(0); + int j = get_global_id(1); + half4 pixel; + pixel.x = convert_half(in[(i * w + j)]); + pixel.y = convert_half(in[h * w + (i * w + j)]); + pixel.z = convert_half(in[2 * h * w + (i * w + j)]); + pixel.w = 0.0; + int2 coords; + coords.x = j; + coords.y = i; + + write_imageh(outputImage,coords,pixel); + } diff --git a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..64bb1845b0bd2c04c8761845b90dbed9e391a77b --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void fetch(__private const int in_height, + __private const int in_width, + __read_only image2d_t input, + __global float* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh)); + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = convert_float(in.x); + out[index + size_ch] = convert_float(in.y); + out[index + size_ch * 2] = convert_float(in.z); + out[index + size_ch * 3] = convert_float(in.w); +} + +__kernel void fetch_2d(__private const int in_height, + __private const int in_width, + __read_only image2d_t input, + __global float* out) { + const int in_w = get_global_id(1); + const int in_h = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + half4 in = read_imageh(input, sampler, (int2)(in_w, in_h)); + + const int index = (in_h * in_width + in_w) * 4; + out[index] = convert_float(in.x); + out[index + 1] = convert_float(in.y); + out[index + 2] = convert_float(in.z); + out[index + 3] = convert_float(in.w); +} diff --git a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..fc660941f8863a0056c4618f0207ae69533d3242 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define MIN_VALUE -FLT_MAX + +__kernel void pool_max( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = max(out_h * stride_h - pad_top, 0); + int end_h = min(start_h + ksize_h, in_height); + + int start_w = max(out_w * stride_w - pad_left, 0); + int end_w = min(start_w + ksize_w, in_width); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + half4 max_value = (half4)(MIN_VALUE); + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + max_value = max(max_value, tmp); + } + } + + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imageh(output, (int2)(pos_out_x, out_nh), max_value); +} + +__kernel void 
pool_avg( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = max(out_h * stride_h - pad_top, 0); + int end_h = min(start_h + ksize_h, in_height); + + int start_w = max(out_w * stride_w - pad_left, 0); + int end_w = min(start_w + ksize_w, in_width); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + half4 sum = (half4)(0.0f); + int num = 0; + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + num++; + } + } + half4 avg = sum / num; + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imageh(output, (int2)(pos_out_x, out_nh), avg); +} diff --git a/src/operators/kernel/cl/cl_kernel/relu.cl b/src/operators/kernel/cl/cl_kernel/relu.cl new file mode 100644 index 0000000000000000000000000000000000000000..cc8f9c3742f7794c51a5e04ac4edde617af0e388 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/relu.cl @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void relu(__read_only image2d_t input, + __write_only image2d_t output){ + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 in = read_imageh(input, sampler, (int2)(x, y)); + in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in); + write_imageh(output, (int2)(x, y), in); +} + +__kernel void relu_p0(__read_only image2d_t input, + __write_only image2d_t output){ + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 in = read_imageh(input, sampler, (int2)(x, y)); + in = max((half4)(0.0f, 0.0f, 0.0f, 0.0f), in); + write_imageh(output, (int2)(x, y), in); +} +__kernel void relu_p1(__read_only image2d_t input, + __write_only image2d_t output){ + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 in = read_imageh(input, sampler, (int2)(x, y)); + write_imageh(output, (int2)(x, y), in); +} diff --git a/src/operators/kernel/cl/cl_kernel/reshape.cl b/src/operators/kernel/cl/cl_kernel/reshape.cl new file mode 100644 index 0000000000000000000000000000000000000000..0ffc64f15cd531879de4852f976769790b6bafe4 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/reshape.cl @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void reshape(__read_only image2d_t input, + __write_only image2d_t output, + __private const int d0, + __private const int d1, + __private const int d2, + __private const int d3, + __private const int x0, + __private const int x1, + __private const int x2, + __private const int x3) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 in = read_imageh(input, sampler, (int2)(x, y)); + + write_imageh(output, (int2)(x, y), in); +} + + +/* + +__kernel void reshape(__read_only image2d_t input, + __write_only image2d_t output, + __private const int d0, + __private const int d1, + __private const int d2, + __private const int d3, + __private const int x0, + __private const int x1, + __private const int x2, + __private const int x3) { + const int x = get_global_id(0); + const int y = get_global_id(1); + int obx = x / x3; + int oby = y / x2; + int ox = x % x3; + int oy = y % x2; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 r; + for (int i = 0; i < 4; i++) { + int t = obx * 4 + i; + if (t > x1) break; + int oindex = oby * x1 * x2 * x3 + t * x2 * x3 + ox * x3 + oy; + int i3 = oindex % d3; oindex /= d3; + int i2 = oindex % d2; oindex /= d2; + int i1 = oindex % d1; oindex /= d1; + 
int i0 = oindex; + int ix = (i1 / 4) * d3 + i3; + int iy = i0 * d2 + i2; + half4 p = read_imageh(input, sampler, (int2)(ix, iy)); + ((half*)&r)[i] = ((half*)&p)[i1%4]; + } + write_imageh(output, (int2)(x, y), r); +} + +*/ diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl new file mode 100644 index 0000000000000000000000000000000000000000..215ec69fc283dcb2b538300cb5591b2b9e4b6a13 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/softmax.cl @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void softmax(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int group + ) { + const int out_c = get_global_id(0); // block index + const int out_w = get_global_id(1); // index in one block + const int out_nh = get_global_id(2); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half maxv = 0.0f; + for (int i = 0; i < group; ++i) { + half4 temp = read_imageh(input_image, sampler, (int2)(i, 0)); + maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w)))); + } + + + half4 rsum = (half4)(0.0f); + for (int i = 0; i < group; ++i) { + half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); + rsum += convert_half4(exp(convert_float4(r - maxv))); + } + + float sum = rsum.x + rsum.y + rsum.z + rsum.w; + + half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh)); + half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum); + write_imageh(output_image, (int2)(out_w, out_nh), result); +} + +/* + +__kernel void softmax(__read_only image2d_t input, + __write_only image2d_t output, + __private const int d0, + __private const int d1, + __private const int d2, + __private const int d3) { + const int z = get_global_id(0); + const int x = get_global_id(1); + const int y = get_global_id(2); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 cv = read_imageh(input, sampler, (int2)(x, y)); + half4 maxv = cv; + for (int i = 0; i < d3; i++) { + half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y)); + maxv = max(maxv, temp); + } + half4 sum = (half4)0.0f; + // half4 x = = (half4)0.0f; + for (int i = 0; i < d3; i++) { + half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y)); + sum += exp(temp - maxv); + } + half4 r = exp(cv - maxv) / sum; + + write_imageh(output, (int2)(z * d3 + x, y), r); +} + +*/ diff --git 
a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33172e4f0343f1bb26e34f6c7d3b009629b60430 --- /dev/null +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,289 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "framework/cl/cl_image.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + param->Bias()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // const CL *mean = param->InputMean(); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + const int C = mean->numel(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " mean - " << j << mean->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " variance - " << j << variance->data()[j]; + // } + // 
+ // for (int j = 0; j < C; ++j) { + // DLOG << " scale - " << j << scale->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " bias - " << j << bias->data()[j]; + // } + + // + // DLOG << " climage mean: " << *mean; + // DLOG << " climage variance: " << *variance; + // DLOG << " climage scale: " << *scale; + // DLOG << " climage bias: " << *bias; + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " new scale - " << j << new_scale_ptr[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " new bias - " << j << new_bias_ptr[j]; + // } + + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - y bias: " << *(param->Bias()); + // + // DLOG << " climage - new scale: " << *new_scale; + + framework::CLImage *new_bias = new framework::CLImage(); + + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - new bias: " << *new_bias; + // + // DLOG << " climage - filter: " << *(param->Filter()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == 
param->Paddings()[1], + "need equal"); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + + param->SetOffset(offset); + + /* + if (param->Filter()->dims()[2] == 1 && + param->Filter()->dims()[3] == 1 && + (param->Filter()->dims()[0] % 16) == 0) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1 4"; + } + */ + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu depth_conv_3x3"; + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv_3x3"; + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto biase = param.Bias()->GetCLImage(); + auto new_scale = 
param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + // DLOG << " c block " << c_block; + // DLOG << " w " << w; + // DLOG << " nh " << nh; + // DLOG << " stride " << stride; + // DLOG << " offset " << offset; + // DLOG << " input_c " << input_c; + // DLOG << " dilation " << dilation; + // DLOG << " input width " << input_width; + // DLOG << " input height " << input_height; + // DLOG << " output width " << output_width; + // DLOG << " output height " << output_height; + // DLOG << " input dim " << param.Input()->dims(); + // DLOG << " output dim " << param.Output()->dims(); + // DLOG << " filter dim " << param.Filter()->dims(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + 
CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + /* + if (param.Filter()->dims()[2] == 1 && + param.Filter()->dims()[3] == 1 && + param.Filter()->dims()[0] % 16 == 0) { + DLOG << " before modifi work size: " << default_work_size; + + default_work_size[0] = default_work_size[0] / 4; + + DLOG << " modification work size: " << default_work_size; + DLOG << " input dims " << param.Input()->dims(); + DLOG << " output dims " << param.Output()->dims(); + DLOG << " filter dims: " << param.Filter()->dims(); + DLOG << " biase dims : " << param.Bias()->dims(); + + } + */ + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e30c6d31db645fb5d18bf70ef5b6876a5f683da --- /dev/null +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddKernel::Init(FusionConvAddParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void 
ConvAddKernel::Compute( + const FusionConvAddParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + DLOG << "---yangfei30---"; + DLOG << *param.Filter(); + DLOG << param.Paddings(); + auto biase = param.Bias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + 
CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..814cff634cb0c4c2d5dd6e6706b558bb1cd64f22 --- /dev/null +++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init( + FusionConvAddReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + DLOG << "---yangfei30---"; + DLOG << *param.Filter(); + DLOG << param.Paddings(); + auto biase = param.Bias()->GetCLImage(); + 
auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), 
kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05cefadce052fb65664cc797c800ec67e43f3a2c --- /dev/null +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvKernel::Init(ConvParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + auto filter_ddim = param->Filter()->dims(); + + std::vector filter_shape( + {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]}); + framework::DDim ddim = framework::make_ddim(filter_shape); + if (filter_ddim[1] == 1) { + param->Filter()->Resize(ddim); + } + + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + DLOG << " init helper: " << &cl_helper_; + DLOG << " conv kernel add kernel ~ "; + DLOG << " width of one block: " << param->Filter()->dims()[3]; + DLOG << " height of one block: " << param->Filter()->dims()[2]; + DLOG << " filter dims: " << param->Filter()->dims(); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + DLOG << " here1 "; + this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl"); + + } else if (param->Filter()->dims()[0] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + DLOG << " here2 "; + this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + DLOG << " here3 "; + this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = 
default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + DLOG << " begin set kernel arg "; + DLOG << " c block " << c_block; + DLOG << " w " << w; + DLOG << " nh " << nh; + DLOG << " stride " << stride; + DLOG << " offset " << offset; + DLOG << " input_c " << input_c; + DLOG << " dilation " << dilation; + DLOG << " input width " << input_width; + DLOG << " input height " << input_height; + DLOG << " output width " << output_width; + DLOG << " output height " << output_height; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = 
param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35813a31f570c8daf956e4c90d0f3e3de1675eb4 --- /dev/null +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef DEPTHWISECONV_OP + +#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DepthwiseConvKernel::Init(ConvParam *param) { + DLOG << " depthwise conv kernel init begin "; + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " depthwise conv kernel init end "; + return true; +} + +template <> +void DepthwiseConvKernel::Compute( + const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = 
clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); +} + +template class DepthwiseConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e62714b3fa3182706270627e7fd1a13b06f3b66a --- /dev/null +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init( + ElementwiseAddParam *param) { + DLOG << "-----init add-----"; + CLImage *bias = (CLImage *)(param->InputY()); + bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); + DLOG << " bias: " << *bias; + if (bias->dims().size() == 4) { + this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + } else if (param->InputY()->dims().size() == 1) { + this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); + } else { + DLOG << "error:bias dims is error"; + } + + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + auto input = param.InputX(); + auto bias = param.InputY(); + auto output = param.Out(); + cl_int status; + auto kernel = this->cl_helper_.KernelAt(0); + if (bias->dims().size() == 4) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + int width = input->ImageWidth(); + int height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias->dims().size() == 1) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[3]; + status = clSetKernelArg(kernel, 0, 
sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + int width = input->ImageWidth(); + int height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + cl_event out_event = param.Out()->GetClEvent(); + cl_event wait_event = param.InputX()->GetClEvent(); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + DLOG << "error:bias dims is error"; + } +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad5fb9cdbcd00dad56579297c010c3912e3dca24 --- /dev/null +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "operators/kernel/feed_kernel.h" +#include "framework/cl/cl_tensor.h" +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + DLOG << "Init feed"; + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + cl_int status; + auto output = param.Out(); + const Tensor *input = param.InputX(); + // DLOG << *input; + const float *input_data = input->data(); + int numel = input->numel(); + cl_mem cl_image = output->GetCLImage(); + int height = output->dims()[2]; + int width = output->dims()[3]; + CLTensor input_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + input_cl_tensor.Resize(input->dims()); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input_data); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height); + CL_CHECK_ERRORS(status); + + size_t global_work_size[2] = {width, height}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31c1d4179cbdfc8145d90bee2353be821e65b40b --- /dev/null +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" +#include "framework/cl/cl_tensor.h" +// #include "common/common.h" +// #include + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + if (param->InputX()->dims().size() <= 2) { + this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); + } else { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + } + auto *out = param->Out(); + out->mutable_data(); + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); + + auto input = param.InputX()->GetCLImage(); + auto *out = param.Out(); + + const auto &dim = param.InputX()->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + + size_t C, in_height, in_width; + + C = new_dims[1]; + in_height = new_dims[2]; + if (dim.size() <= 2) { + in_width = param.InputX()->ImageWidth(); + } else { + in_width = new_dims[3]; + } + + CLTensor out_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + out_cl_tensor.Resize(out->dims()); + cl_mem outBuffer = out_cl_tensor.mutable_data(); + + clSetKernelArg(kernel, 0, sizeof(int), &in_height); + clSetKernelArg(kernel, 1, sizeof(int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + 
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); + if (dim.size() > 2) { + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + } + + // cl_event wait_event = param.InpdutX()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + // auto time1 = paddle_mobile::time(); + + // printf(" before finish \n"); + // clFlsh(this->cl_helper_.CLCommandQueue()); + clFinish(this->cl_helper_.CLCommandQueue()); + // printf(" after finish \n"); + + // auto time2 = paddle_mobile::time(); + // + // + // std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2) + // << "ms" << std::endl; + + memcpy(out->data(), out_cl_tensor.Data(), out->memory_size()); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/pool_kernel.cpp b/src/operators/kernel/cl/pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df79ababadd4c1b959a1eb0fe237a45ab97a6bd8 --- /dev/null +++ b/src/operators/kernel/cl/pool_kernel.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef POOL_OP + +#include "operators/kernel/pool_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool PoolKernel::Init(PoolParam *param) { + std::string pooling_type = param->PoolingType(); + this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl"); + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + + auto input = param.Input()->GetCLImage(); + auto out = param.Output()->GetCLImage(); + + framework::CLImageConverterFolder *input_folder_converter = + reinterpret_cast( + param.Input()->Converter()); + framework::CLImageConverterFolder *output_folder_converter = + reinterpret_cast( + param.Output()->Converter()); + + const int in_height = input_folder_converter->HeightOfOneBlock(); + const int in_width = input_folder_converter->WidthOfOneBlock(); + const int out_height = output_folder_converter->HeightOfOneBlock(); + const int out_width = output_folder_converter->WidthOfOneBlock(); + + std::string pooling_type = param.PoolingType(); + std::vector ksize = param.Ksize(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + const int pad_top = paddings[0]; + const int pad_left = paddings[1]; + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int ksize_h = ksize[0]; + const int ksize_w = ksize[1]; + + clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height); + clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height); + clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width); + clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top); + clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left); + clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h); + clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w); + clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h); + 
clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w); + clSetKernelArg(kernel, 10, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 11, sizeof(cl_mem), &out); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class PoolKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3acfe442201a9be59c6f0a0a536cf9aea68c4a2 --- /dev/null +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef RELU_OP + +#include "operators/kernel/relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReluKernel::Init(ReluParam* param) { + this->cl_helper_.AddKernel("relu", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p0", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p1", "relu.cl"); + // const auto dim = + // const_cast(param->InputX())->ImageDims(); + // param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), + // this->cl_helper_.CLCommandQueue(), + // dim); + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam& param) { + auto kernel = this->cl_helper_.KernelAt(0); + // auto kernel_p0 = this->cl_helper_.KernelAt(1); + // auto kernel_p1 = this->cl_helper_.KernelAt(2); + const auto* input = param.InputX(); + auto* output = param.Out(); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + // auto tImage = + // const_cast&>(param).getMidImage().GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); + const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); + // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3, + // NULL, + // work_size, NULL, 0, NULL, NULL); +} + +template class ReluKernel; + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git 
a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fb3aa9b52f722b21cdc30e54eafadf9dffcfef7a --- /dev/null +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/reshape_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReshapeKernel::Init(ReshapeParam *param) { + this->cl_helper_.AddKernel("reshape", "reshape.cl"); + return true; +} + +template <> +void ReshapeKernel::Compute(const ReshapeParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + const auto *input = param.InputX(); + auto *output = param.Out(); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + const auto &inputDim = input->dims(); + const auto &outputDim = output->dims(); + int dims[4] = {1, 1, 1, 1}; + int odims[4] = {1, 1, 1, 1}; + // 1 1000 1 1 + for (int i = 0; i < inputDim.size(); i++) { + dims[4 - inputDim.size() + i] = inputDim[i]; + } + + // 1 1 1 1000 + for (int i = 0; i < outputDim.size(); i++) { + odims[4 - outputDim.size() + i] = outputDim[i]; + } + clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); + clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); + 
clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); + clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); + clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); + clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); + const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); +} + +template class ReshapeKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22e6672ee462b963476dc72895329a9117fc16a8 --- /dev/null +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SOFTMAX_OP + +#include "operators/kernel/softmax_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SoftmaxKernel::Init(SoftmaxParam *param) { + this->cl_helper_.AddKernel("softmax", "softmax.cl"); + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); + const auto *input = param.InputX(); + auto *output = param.Out(); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + + int group = output->ImageWidth(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + status = clSetKernelArg(kernel, 2, sizeof(int), &group); + + // const auto &inputDim = input->dims(); + // + // int dims[4] = {1, 1, 1, 1}; + // + // for (int i = 0; i < inputDim.size(); i++) { + // dims[4 - inputDim.size() + i] = inputDim[i]; + // } + // + // clSetKernelArg(kernel, 2, sizeof(int), &dims); + // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); + // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); + // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); +} + +template class SoftmaxKernel; + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h index 61100bf5f0e9de43bfb6295a0719f1be0954d128..ac9ebca4d5ab30307303b8720677e67470634b44 100644 --- a/src/operators/kernel/concat_kernel.h +++ b/src/operators/kernel/concat_kernel.h @@ -27,7 +27,7 @@ template class ConcatKernel : 
public framework::OpKernelBase> { public: - void Compute(const ConcatParam ¶m) const; + void Compute(const ConcatParam ¶m); bool Init(ConcatParam *param); }; diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h index 5715cd46d5a6c7e80ab5ff77ba83c7973e1db811..fadaf7564ceeb7a52215dc335135016be02bc1ab 100644 --- a/src/operators/kernel/conv_add_add_prelu_kernel.h +++ b/src/operators/kernel/conv_add_add_prelu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddAddPReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddAddPReluParam ¶m) const; + void Compute(const FusionConvAddAddPReluParam ¶m); bool Init(FusionConvAddAddPReluParam *param); }; diff --git a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h index ee73215c4688c3e604de69cda55b05e63844c0b8..7a921ecc7d0f4498cae80fbb9cea1b13e4c94101 100644 --- a/src/operators/kernel/conv_add_bn_kernel.h +++ b/src/operators/kernel/conv_add_bn_kernel.h @@ -35,7 +35,7 @@ template class ConvAddBNKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddBNParam ¶m) const; + void Compute(const FusionConvAddBNParam ¶m); bool Init(FusionConvAddBNParam *param); }; diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 9faaaedcf8d6f825f818ebf5121dc7685185d5d8..3f088528fc901987873038c7e1dd779dcc2019e7 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddBNReluParam ¶m) const; + void Compute(const FusionConvAddBNReluParam ¶m); bool Init(FusionConvAddBNReluParam *param); }; diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 360cbb6775168885e9c1a25db1f9ffb9e552324b..4e9ff0853f1d502ebb4dc4ef3641d0a879f32b60 100644 --- 
a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -40,7 +40,7 @@ template class ConvAddKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddParam ¶m) const; + void Compute(const FusionConvAddParam ¶m); bool Init(FusionConvAddParam *param); }; diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h index a109f84cf09b4d0e2469a1885b902c0f70acc6c8..631982789b09c57d0d21186d0a30df7368d2955f 100644 --- a/src/operators/kernel/conv_add_prelu_kernel.h +++ b/src/operators/kernel/conv_add_prelu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddPReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddPReluParam ¶m) const; + void Compute(const FusionConvAddPReluParam ¶m); bool Init(FusionConvAddPReluParam *param); }; diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index f33b1dc312e1d94be0c23cff55e9e6789a556bc7..e001926b361da96ec3ff76e120bc3d1ad13714fa 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddReluParam ¶m) const; + void Compute(const FusionConvAddReluParam ¶m); bool Init(FusionConvAddReluParam *param); }; diff --git a/src/operators/kernel/conv_bn_add_relu_kernel.h b/src/operators/kernel/conv_bn_add_relu_kernel.h index 820e5f8bcbf58676e8374e575044b10fe4676efa..dcd8fecf07fbb4ea75b382f5315e24e64e26e939 100644 --- a/src/operators/kernel/conv_bn_add_relu_kernel.h +++ b/src/operators/kernel/conv_bn_add_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvBNAddReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNAddReluParam ¶m) const; + void Compute(const FusionConvBNAddReluParam ¶m); bool Init(FusionConvBNAddReluParam *param); }; diff --git a/src/operators/kernel/conv_bn_kernel.h 
b/src/operators/kernel/conv_bn_kernel.h index f740ca836481c1331ea2e889865b3078d48644a6..e669f3bdd85dbd89e3a48d417dcd0cd6b9706062 100644 --- a/src/operators/kernel/conv_bn_kernel.h +++ b/src/operators/kernel/conv_bn_kernel.h @@ -35,7 +35,7 @@ template class ConvBNKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNParam ¶m) const; + void Compute(const FusionConvBNParam ¶m); bool Init(FusionConvBNParam *param); }; diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h index 225976aa5db31096ef691ecefa8b63d4ae3dc277..91b3413116ae22a8e212cf149c4e0c2a8924664a 100644 --- a/src/operators/kernel/conv_bn_relu_kernel.h +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNReluParam ¶m) const; + void Compute(const FusionConvBNReluParam ¶m); bool Init(FusionConvBNReluParam *param); }; diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index 93474adaa97743d1850b53df114ae08f144aebca..cac498c36bd5debef0ff996cdf017355a2371a18 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -31,7 +31,7 @@ using framework::OpKernelBase; template class ConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; + void Compute(const ConvParam ¶m); bool Init(ConvParam *param); }; diff --git a/src/operators/kernel/conv_transpose_kernel.h b/src/operators/kernel/conv_transpose_kernel.h index 761370095cae9751eb479521d6378c4f7ccaefe5..6341a87d43fdb3a3ca63fadd90239bdf2a6921a8 100644 --- a/src/operators/kernel/conv_transpose_kernel.h +++ b/src/operators/kernel/conv_transpose_kernel.h @@ -28,7 +28,7 @@ template class ConvTransposeKernel : public OpKernelBase> { public: - void Compute(const ConvTransposeParam ¶m) const; + void Compute(const ConvTransposeParam ¶m); bool Init(ConvTransposeParam *param); }; diff --git 
a/src/operators/kernel/crf_kernel.h b/src/operators/kernel/crf_kernel.h index 71c07cf0384d482522de3a6652c6d24a22af656a..1436aafc0603d4c7ba9ecae911f10bd8f297852a 100644 --- a/src/operators/kernel/crf_kernel.h +++ b/src/operators/kernel/crf_kernel.h @@ -28,7 +28,7 @@ template class CrfKernel : public framework::OpKernelBase> { public: - void Compute(const CrfParam& param) const; + void Compute(const CrfParam& param); bool Init(CrfParam* param); }; } // namespace operators diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h index 605b81cd6ed4ccd54b1803cf7a603b8f4576982d..3ee5bf86e97baa3970239e32b7fd5fc341e09f92 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -31,7 +31,7 @@ template class DepthwiseConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; + void Compute(const ConvParam ¶m); bool Init(ConvParam *param); }; } // namespace operators diff --git a/src/operators/kernel/dequantize_kernel.h b/src/operators/kernel/dequantize_kernel.h index d147e3f94ab87165cceac886289e74747906e047..6ba8ec88c52f20ccfcd30d5b9a217eaef658d507 100644 --- a/src/operators/kernel/dequantize_kernel.h +++ b/src/operators/kernel/dequantize_kernel.h @@ -26,7 +26,7 @@ template class DequantizeKernel : public framework::OpKernelBase> { public: - void Compute(const DequantizeParam ¶m) const; + void Compute(const DequantizeParam ¶m); bool Init(DequantizeParam *param); }; diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index b7535095d4fef11ee628aea96a074abcc3562f7f..2f59d01b6723eea274b1ed059ae08863a4937961 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -26,7 +26,7 @@ template class DropoutKernel : public framework::OpKernelBase> { public: - void Compute(const DropoutParam& param) const; + void Compute(const DropoutParam& param); bool Init(DropoutParam* para); }; 
} // namespace operators diff --git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h index 594c594cb00f8f4ddd8a511f3c992c4efbfcdfc6..f2e4c0afbd0aaafff5339816764f9e30592f122c 100644 --- a/src/operators/kernel/dwconv_bn_relu_kernel.h +++ b/src/operators/kernel/dwconv_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class DWConvBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionDWConvBNReluParam ¶m) const; + void Compute(const FusionDWConvBNReluParam ¶m); bool Init(FusionDWConvBNReluParam *param); }; diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h index 67182af2e20e23c40effab6b87eefde1e0ab629d..8fa07e519ec0b78baffabd08fb7e524f8259c9eb 100644 --- a/src/operators/kernel/elementwise_add_kernel.h +++ b/src/operators/kernel/elementwise_add_kernel.h @@ -30,7 +30,7 @@ class ElementwiseAddKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddParam ¶m) const; + void Compute(const ElementwiseAddParam ¶m); bool Init(ElementwiseAddParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_add_relu_kernel.h b/src/operators/kernel/elementwise_add_relu_kernel.h index 5eda5a0c56c228ad54c888b6faa82ce9417f2dc1..d18c4e27fa3345b1818d0e6149fc8fb83195f644 100644 --- a/src/operators/kernel/elementwise_add_relu_kernel.h +++ b/src/operators/kernel/elementwise_add_relu_kernel.h @@ -29,7 +29,7 @@ class ElementwiseAddReluKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddReluParam ¶m) const; + void Compute(const ElementwiseAddReluParam ¶m); bool Init(ElementwiseAddReluParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h index 63f0df4815dc143e482140a855eb254bd016d50c..54baa50fcafb8ddbbefecb635ea85f120f16250d 100644 --- a/src/operators/kernel/elementwise_mul_kernel.h +++ 
b/src/operators/kernel/elementwise_mul_kernel.h @@ -28,7 +28,7 @@ class ElementwiseMulKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseMulParam ¶m) const; + void Compute(const ElementwiseMulParam ¶m); bool Init(ElementwiseMulParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_sub_kernel.h b/src/operators/kernel/elementwise_sub_kernel.h index 9516dcbd3de09debe233571eb5f60b3b8b19a2fa..89536b920837b57c4017ccadff7ea6e233cd999e 100644 --- a/src/operators/kernel/elementwise_sub_kernel.h +++ b/src/operators/kernel/elementwise_sub_kernel.h @@ -28,7 +28,7 @@ class ElementwiseSubKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseSubParam ¶m) const; + void Compute(const ElementwiseSubParam ¶m); bool Init(ElementwiseSubParam *param); }; diff --git a/src/operators/kernel/fc_relu_kernel.h b/src/operators/kernel/fc_relu_kernel.h index 6e9446da37df4ba83db85d416aa87f216816c4a5..6735a50bee86e25d9f8d091b6218a472f3838aec 100644 --- a/src/operators/kernel/fc_relu_kernel.h +++ b/src/operators/kernel/fc_relu_kernel.h @@ -28,7 +28,7 @@ class FusionFcReluKernel : public framework::OpKernelBase> { public: - void Compute(const FusionFcReluParam& param) const; + void Compute(const FusionFcReluParam& param); bool Init(FusionFcReluParam* param); }; } // namespace operators diff --git a/src/operators/kernel/feed_kernel.h b/src/operators/kernel/feed_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2b1220fee534040e5ccae5aee84adf3b4b6290b9 --- /dev/null +++ b/src/operators/kernel/feed_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using namespace framework; +template +class FeedKernel + : public framework::OpKernelBase> { + public: + void Compute(const FeedParam ¶m); + bool Init(FeedParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fetch_kernel.h b/src/operators/kernel/fetch_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d9ed91855d0db5149cc8cf4f5d571afd1fbea98f --- /dev/null +++ b/src/operators/kernel/fetch_kernel.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +template +class FetchKernel + : public framework::OpKernelBase> { + public: + void Compute(const FetchParam ¶m); + bool Init(FetchParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/flatten_kernel.h b/src/operators/kernel/flatten_kernel.h index 80d66ccf87c21532c8b4590d992f5bccbe4f00dc..4846725bcb6522389d29e137980b9d53e63f9f32 100644 --- a/src/operators/kernel/flatten_kernel.h +++ b/src/operators/kernel/flatten_kernel.h @@ -28,7 +28,7 @@ template class FlattenKernel : public framework::OpKernelBase> { public: - void Compute(const FlattenParam& param) const; + void Compute(const FlattenParam& param); bool Init(FlattenParam* param); }; } // namespace operators diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp index f61afd4a5c514ced87396313ea5d645fe830e12a..6644bfd83e57a7fd147c0cc6383e64eb2ad79e51 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -58,7 +58,7 @@ bool ConcatKernel::Init(ConcatParam *param) { } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) { ComputeFPGAConcat(param.FpgaArgs()); } template class ConcatKernel; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 9b3944fc9a9ab308d9fe8b791a34e09651b87e6e..679a95ff54168da821ed0debb80b6bce8eca407b 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -78,7 +78,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { template <> void ConvAddBNKernel::Compute( - const FusionConvAddBNParam ¶m) const { + const FusionConvAddBNParam ¶m) { 
fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index 83f74e97d04eda29f3aaa6a0cc16ed7d194321d8..6c99750eb824940b32a857ee2baffc72bce05a7a 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -76,7 +76,7 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 4975f2a905dcd76c5b7f013eafaa376dd2bb1646..ce2fbbda0ee4c7e0a1e97b45674ef269df3be3be 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -58,7 +58,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 276e71b6a44e9a7beba0d5db2f51472a9927d8da..ac9f19e411a87bb31e320df504a0e1c88e195454 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -69,8 +69,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { } template <> -void ConvBNKernel::Compute( - const FusionConvBNParam ¶m) const { +void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index f519a37cb57378a603969adae255f88ae8a5df2a..4c9eb391ada9366478877494fbe466d5cf919327 100644 --- 
a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -70,7 +70,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { template <> void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) const { + const FusionConvBNReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/dropout_kernel.cpp index b0981c4254060996a16f4ae5beabb7c22edd6d34..8b990d46e0b90bf67eaf36bbf38238fd4432ace6 100644 --- a/src/operators/kernel/fpga/dropout_kernel.cpp +++ b/src/operators/kernel/fpga/dropout_kernel.cpp @@ -26,8 +26,7 @@ bool DropoutKernel::Init(DropoutParam *param) { } template <> -void DropoutKernel::Compute( - const DropoutParam ¶m) const {} +void DropoutKernel::Compute(const DropoutParam ¶m) {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp index b592dd6d59a5d5cec8f12ef304099d2b89a10a05..5253d4d0d3e00190b4ed594279d9190659ec6026 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -56,7 +56,7 @@ bool ElementwiseAddReluKernel::Init( template <> void ElementwiseAddReluKernel::Compute( - const ElementwiseAddReluParam ¶m) const { + const ElementwiseAddReluParam ¶m) { fpga::ComputeFpgaEWAdd(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 52d7c0a4e69080e11f86d1507829e7e779a69228..2c6b616689dca14474d1cbdc3769b438de1358e4 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -61,7 +61,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { } template <> void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) const { + const 
FusionFcReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/feed_kernel.cpp b/src/operators/kernel/fpga/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..161d8c9f0cf22ac79d1367e07b8ba3318a7a7123 --- /dev/null +++ b/src/operators/kernel/fpga/feed_kernel.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto input = + reinterpret_cast(const_cast(param.InputX())); + auto input_ptr = input->data(); + fpga::format_image(input); + Tensor *output = param.Out(); + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = reinterpret_cast(input_ptr); + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + 
args.output.scale_address = output->scale; + fpga::PerformBypass(args); +} +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/fetch_kernel.cpp b/src/operators/kernel/fpga/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e56ac301271f120d888a4feea11122f40885633e --- /dev/null +++ b/src/operators/kernel/fpga/fetch_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 407e14238d542604e876ced624d5a0db698a6101..9258fb90e1e6bf9a597a387843ce781858628139 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -62,8 +62,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { } template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { +void FusionFcKernel::Compute(const FusionFcParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index 6269506836c25d756040cd25cf9b0189fd03d89b..c8179913c58351749bfdb5cc154d1eba2453e079 100644 --- a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -53,7 +53,7 @@ bool PoolKernel::Init(PoolParam *param) { } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { +void PoolKernel::Compute(const PoolParam ¶m) { fpga::ComputeFpgaPool(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index e36db57f4b4f18712df50b2b132cdd1032a41921..5fbe1a9e07da7eb71b088f84efbc6606b9bc5ff9 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -47,8 +47,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { } template <> -void SoftmaxKernel::Compute( - const SoftmaxParam ¶m) const { +void 
SoftmaxKernel::Compute(const SoftmaxParam ¶m) { Tensor *in_x = param.FloatInput(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h index 06d3981bd23708aee982e38d82ba592d69733a89..b8086bc66fbef7ec952548a3cb863cfa031c504e 100644 --- a/src/operators/kernel/fusion_fc_kernel.h +++ b/src/operators/kernel/fusion_fc_kernel.h @@ -27,7 +27,7 @@ template class FusionFcKernel : public framework::OpKernelBase> { public: - void Compute(const FusionFcParam& param) const; + void Compute(const FusionFcParam& param); bool Init(FusionFcParam* param); }; diff --git a/src/operators/kernel/gru_kernel.h b/src/operators/kernel/gru_kernel.h index 6b02663bd0e2982bdb2480c54632d2a8da9f67fc..b03b2e3ecb514fdf962bde9c06620fa6e64934df 100644 --- a/src/operators/kernel/gru_kernel.h +++ b/src/operators/kernel/gru_kernel.h @@ -28,7 +28,7 @@ template class GruKernel : public framework::OpKernelBase> { public: - void Compute(const GruParam& param) const; + void Compute(const GruParam& param); bool Init(GruParam* param); }; } // namespace operators diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h index df93ea5abacda1a5291caa53dc5dae7ea2b5d710..b15eb68996a990f6bc770db6940be83a0eea0cbf 100644 --- a/src/operators/kernel/im2sequence_kernel.h +++ b/src/operators/kernel/im2sequence_kernel.h @@ -29,7 +29,7 @@ template class Im2SequenceKernel : public framework::OpKernelBase> { public: - void Compute(const Im2SequenceParam& param) const; + void Compute(const Im2SequenceParam& param); bool Init(Im2SequenceParam* para); }; } // namespace operators diff --git a/src/operators/kernel/lookup_kernel.h b/src/operators/kernel/lookup_kernel.h index 73f6cfcced078382b40526eae1f6560d7d168b97..8c29349e737b0fba95688e1ebb8fe893a29b2a4f 100644 --- a/src/operators/kernel/lookup_kernel.h +++ b/src/operators/kernel/lookup_kernel.h @@ -28,7 +28,7 @@ template class LookupKernel : public 
framework::OpKernelBase> { public: - void Compute(const LookupParam& param) const; + void Compute(const LookupParam& param); bool Init(LookupParam* param); }; } // namespace operators diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h index 164178f1dcc0ee2523fc9c5fdc4736c14a3e55ce..99dbfe2d658cde17e6399f8ea4bc5b945092cde5 100644 --- a/src/operators/kernel/lrn_kernel.h +++ b/src/operators/kernel/lrn_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #ifdef LRN_OP #ifdef _OPENMP #include @@ -173,7 +175,7 @@ template class LrnKernel : public framework::OpKernelBase> { public: - void Compute(const LrnParam ¶m) const; + void Compute(const LrnParam ¶m); bool Init(LrnParam *param); }; } // namespace operators diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp index 50f6ef5f566347c089869c30b8f7534a4f8b6779..5d50ca9a7250f66f20b6bfaf0d93db18014d791c 100755 --- a/src/operators/kernel/mali/batchnorm_kernel.cpp +++ b/src/operators/kernel/mali/batchnorm_kernel.cpp @@ -145,7 +145,7 @@ bool BatchNormKernel::Init(BatchNormParam* param) { template <> void BatchNormKernel::Compute( - const BatchNormParam& param) const { + const BatchNormParam& param) { std::cout << "init acl" << std::endl; AclBatchNormOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp index 267c0101a8f66de3d508dbe5795c87ee5027a288..2fb05ab10eccf4e0dca9c74bbcc83067b438e981 100644 --- a/src/operators/kernel/mali/concat_kernel.cpp +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -118,7 +118,7 @@ bool ConcatKernel::Init(ConcatParam* param) { template <> void ConcatKernel::Compute( - const ConcatParam& param) const { + const ConcatParam& param) { std::cout << "init 
acl" << std::endl; AclConcatOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp index 74cace00dd2dead7a5d9ddfc76e2d48c67cccf89..427bcd596f71bf434ea155d04f192c5bdedfded5 100644 --- a/src/operators/kernel/mali/conv_add_kernel.cpp +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -212,7 +212,7 @@ bool ConvAddKernel::Init(FusionConvAddParam* param) { template <> void ConvAddKernel::Compute( - const FusionConvAddParam& param) const { + const FusionConvAddParam& param) { std::cout << "init acl" << std::endl; AclConvAddOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp index 7852e64990e5a2cd6f3d7e803e71c23c55aa7a27..7cca16274ecc7ae1707f8d5ed8faf2fde810ab30 100644 --- a/src/operators/kernel/mali/conv_kernel.cpp +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -211,8 +211,7 @@ bool ConvKernel::Init(ConvParam* param) { } template <> -void ConvKernel::Compute( - const ConvParam& param) const { +void ConvKernel::Compute(const ConvParam& param) { std::cout << "init acl" << std::endl; AclConvOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp index 5596476e1bb33ecc2b3122bf237090b099307156..3711a946b508c9ad71f59dd85f2e01c99bccc9e5 100644 --- a/src/operators/kernel/mali/elementwise_add_kernel.cpp +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -34,7 +34,7 @@ bool ElementwiseAddKernel::Init( template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *Out = param.Out(); diff --git a/src/operators/kernel/mali/feed_kernel.cpp b/src/operators/kernel/mali/feed_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..6af6c1a88b8031da4a23dad1d3269935ce81b9a8 --- /dev/null +++ b/src/operators/kernel/mali/feed_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fetch_kernel.cpp b/src/operators/kernel/mali/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f74280cfb322b8135d99ca7fb7e2652a08588bb3 --- /dev/null +++ b/src/operators/kernel/mali/fetch_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp index c3197f38c6c6ee1a4f4f684c824a9a9e43d69d4f..5e59215834ce00e902deb19e54e149b3b4cfb8ac 100755 --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -26,7 +26,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { template <> void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { + const FusionFcParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); const Tensor *input_z = param.InputZ(); diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp index fc088f735c538bedc4d5c79593aa31c48acc4fc6..b46c9680d576ead3e7ab309c08894654a9fad04a 100644 --- a/src/operators/kernel/mali/lrn_kernel.cpp +++ b/src/operators/kernel/mali/lrn_kernel.cpp @@ -127,8 +127,7 @@ bool LrnKernel::Init(LrnParam* param) { } template <> -void LrnKernel::Compute( - const LrnParam& param) const { +void LrnKernel::Compute(const LrnParam& param) { std::cout << "init acl" << std::endl; AclLrnOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp index a9e54dad2b51c595be4f68df3916a4803047617e..da69f5e6fe5a4ec95373011d360cd4d9e20a8a61 100644 --- a/src/operators/kernel/mali/mul_kernel.cpp +++ b/src/operators/kernel/mali/mul_kernel.cpp @@ -27,8 +27,7 @@ 
bool MulKernel::Init(MulParam *param) { } template <> -void MulKernel::Compute( - const MulParam ¶m) const { +void MulKernel::Compute(const MulParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp index 33b3bd7017739144a519bfb1be247b4751883779..ec5d35a8f600d63a623b468c9c97c3540bf9c3f7 100644 --- a/src/operators/kernel/mali/pool_kernel.cpp +++ b/src/operators/kernel/mali/pool_kernel.cpp @@ -195,8 +195,7 @@ bool PoolKernel::Init(PoolParam* param) { } template <> -void PoolKernel::Compute( - const PoolParam& param) const { +void PoolKernel::Compute(const PoolParam& param) { std::cout << "init acl" << std::endl; AclPoolOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp index 10b270800dee1a0ad8176da1f788100d29b60173..68bb52af3ab9b262218223d971b044edd759b347 100644 --- a/src/operators/kernel/mali/relu_kernel.cpp +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -115,8 +115,7 @@ bool ReluKernel::Init(ReluParam* param) { } template <> -void ReluKernel::Compute( - const ReluParam& param) const { +void ReluKernel::Compute(const ReluParam& param) { std::cout << "init acl" << std::endl; AclReluOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp index 69c077e252162017cb477a000b5f17f5a968fc10..f98906c0a982c10896e75101eaa2732d75d6cdf4 100644 --- a/src/operators/kernel/mali/reshape_kernel.cpp +++ b/src/operators/kernel/mali/reshape_kernel.cpp @@ -28,7 +28,7 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute( - const ReshapeParam ¶m) const { + const ReshapeParam ¶m) { const auto *input_x = param.InputX(); const auto &input_x_dims = input_x->dims(); auto *out = param.Out(); diff 
--git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp index d4f25c96cc47d7baa394645d4e0c84e0e3f7ad29..d6ce1ecb61c2790c68883231eb6b90dcde43a956 100644 --- a/src/operators/kernel/mali/softmax_kernel.cpp +++ b/src/operators/kernel/mali/softmax_kernel.cpp @@ -113,7 +113,7 @@ bool SoftmaxKernel::Init(SoftmaxParam* param) { template <> void SoftmaxKernel::Compute( - const SoftmaxParam& param) const { + const SoftmaxParam& param) { std::cout << "init acl" << std::endl; AclSoftmaxOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h index e441de4d4495b736aec248c0ef85191b32bfcbf9..8deb4a2cb74786257ddfc12c805c4a7d56589bbf 100644 --- a/src/operators/kernel/mul_kernel.h +++ b/src/operators/kernel/mul_kernel.h @@ -29,7 +29,7 @@ template class MulKernel : public framework::OpKernelBase> { public: - void Compute(const MulParam ¶m) const; + void Compute(const MulParam ¶m); bool Init(MulParam *param); }; } // namespace operators diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h index b1b20ddd81b395ea94ae62b1abf2fe861d9257db..6a4ac0c22941aa364f05e38c7abaf29948cd324b 100644 --- a/src/operators/kernel/multiclass_nms_kernel.h +++ b/src/operators/kernel/multiclass_nms_kernel.h @@ -28,7 +28,7 @@ class MultiClassNMSKernel : public framework::OpKernelBase> { public: - void Compute(const MultiClassNMSParam& param) const; + void Compute(const MultiClassNMSParam& param); bool Init(MultiClassNMSParam* param); }; } // namespace operators diff --git a/src/operators/kernel/polygon_box_transform_kernel.h b/src/operators/kernel/polygon_box_transform_kernel.h index d5baf32cc7dca0aee1eb0b7c13895e806f70320a..6ed003a4c794e7293ae3506909a779f95a677579 100644 --- a/src/operators/kernel/polygon_box_transform_kernel.h +++ b/src/operators/kernel/polygon_box_transform_kernel.h @@ -27,7 +27,7 @@ class PolygonBoxTransformKernel : 
public framework::OpKernelBase> { public: - void Compute(const PolygonBoxTransformParam& param) const; + void Compute(const PolygonBoxTransformParam& param); bool Init(PolygonBoxTransformParam* param); }; } // namespace operators diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h index 2be254444cc410fb95a94125cccb224ca9505545..ff80e0e44536d924026dbbe80a09677c069a8f6b 100644 --- a/src/operators/kernel/pool_kernel.h +++ b/src/operators/kernel/pool_kernel.h @@ -26,7 +26,7 @@ using framework::OpKernelBase; template class PoolKernel : public OpKernelBase> { public: - void Compute(const PoolParam ¶m) const override; + void Compute(const PoolParam ¶m); bool Init(PoolParam *param); }; } // namespace operators diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h index f6c7c3ac7f139cf7eafe8843ef48e53c90292082..c043149243f21f2abceeed37c5d0e81a61e5059f 100644 --- a/src/operators/kernel/prelu_kernel.h +++ b/src/operators/kernel/prelu_kernel.h @@ -24,7 +24,7 @@ template class PReluKernel : public framework::OpKernelBase> { public: - void Compute(const PReluParam& param) const; + void Compute(const PReluParam& param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h index 5640375483d42d52965986dab6795254bbf4b908..921d5901a8f24abab61f7aa94663385d91e597a7 100644 --- a/src/operators/kernel/prior_box_kernel.h +++ b/src/operators/kernel/prior_box_kernel.h @@ -54,7 +54,7 @@ template class PriorBoxKernel : public framework::OpKernelBase> { public: - void Compute(const PriorBoxParam& param) const; + void Compute(const PriorBoxParam& param); bool Init(PriorBoxParam* param); }; } // namespace operators diff --git a/src/operators/kernel/quantize_kernel.h b/src/operators/kernel/quantize_kernel.h index c55ca2182acd0f459c785f29d359ea9039a7350a..d864e00d9c80003d06d460f85b6fddda40e6d607 100644 --- 
a/src/operators/kernel/quantize_kernel.h +++ b/src/operators/kernel/quantize_kernel.h @@ -26,7 +26,7 @@ template class QuantizeKernel : public framework::OpKernelBase> { public: - void Compute(const QuantizeParam ¶m) const; + void Compute(const QuantizeParam ¶m); bool Init(QuantizeParam *param); }; diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h index b0c32791d626f14b0840ce1c8f3f12f02b403d97..48f47c2de6df8d3aa9461fba915fd1a6406d4b9f 100644 --- a/src/operators/kernel/relu_kernel.h +++ b/src/operators/kernel/relu_kernel.h @@ -27,7 +27,7 @@ template class ReluKernel : public framework::OpKernelBase> { public: - void Compute(const ReluParam& param) const; + void Compute(const ReluParam& param); bool Init(ReluParam* param); }; } // namespace operators diff --git a/src/operators/kernel/reshape2_kernel.h b/src/operators/kernel/reshape2_kernel.h index 8d15a619d314e3f5d3085a34cff503e286b5ee37..c6ab3cf72a29612249d0ff08e56ef60ca30d59a8 100644 --- a/src/operators/kernel/reshape2_kernel.h +++ b/src/operators/kernel/reshape2_kernel.h @@ -27,7 +27,7 @@ template class Reshape2Kernel : public framework::OpKernelBase> { public: - void Compute(const Reshape2Param& param) const; + void Compute(const Reshape2Param& param); bool Init(Reshape2Param* param); }; } // namespace operators diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h index 73eb63f797f34ec4eb2baec8c4ab79fafb06f0e2..a5405654874320cdfe3432d16d3a8c6358d2d8e1 100644 --- a/src/operators/kernel/reshape_kernel.h +++ b/src/operators/kernel/reshape_kernel.h @@ -71,7 +71,7 @@ template class ReshapeKernel : public framework::OpKernelBase> { public: - void Compute(const ReshapeParam& param) const; + void Compute(const ReshapeParam& param); bool Init(ReshapeParam* param); }; } // namespace operators diff --git a/src/operators/kernel/resize_kernel.h b/src/operators/kernel/resize_kernel.h index 
7102d2f4bc9bc64d53fa40697cf2b7a68d8be566..b25a0dcef5d291f03e4bb1a127eb0b592ee89055 100644 --- a/src/operators/kernel/resize_kernel.h +++ b/src/operators/kernel/resize_kernel.h @@ -74,7 +74,7 @@ template class ResizeKernel : public framework::OpKernelBase> { public: - void Compute(const ResizeParam ¶m) const; + void Compute(const ResizeParam ¶m); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h index 2da92d8d3c8b0d7867e7e6e628a04a853dd69464..a17e57652224992b2ee7127e6081804bf3253fb1 100644 --- a/src/operators/kernel/scale_kernel.h +++ b/src/operators/kernel/scale_kernel.h @@ -24,7 +24,7 @@ template class ScaleKernel : public framework::OpKernelBase> { public: - void Compute(const ScaleParam& param) const; + void Compute(const ScaleParam& param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/shape_kernel.h b/src/operators/kernel/shape_kernel.h index 7caf3e427a4f3b469265248708a3090c52d1ca91..9d3c6e1701523acc43410fb0e3402b5679d4f19a 100644 --- a/src/operators/kernel/shape_kernel.h +++ b/src/operators/kernel/shape_kernel.h @@ -28,7 +28,7 @@ template class ShapeKernel : public framework::OpKernelBase> { public: - void Compute(const ShapeParam& param) const; + void Compute(const ShapeParam& param); bool Init(ShapeParam* param); }; } // namespace operators diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index e68f215b00aa2f9faba850853efe4896752a8f7b..db9fc3dd3cb1e6c0eb56cd5a14a173f5a031263c 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -28,7 +28,7 @@ template class SigmoidKernel : public OpKernelBase> { public: - void Compute(const SigmoidParam& param) const override; + void Compute(const SigmoidParam& param); bool Init(SigmoidParam* param); }; diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index 
17f7fe4a9ebf5b78fc92c41abd4756a7bc6bff45..6ae6528622b37c2f2694d70da3e74540e3404c99 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -24,7 +24,7 @@ template class SliceKernel : public framework::OpKernelBase> { public: - void Compute(const SliceParam& param) const {} + void Compute(const SliceParam& param) {} }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h index 67bd9167e8c717355fc326d3025cde410ce66010..d7d7435fd5145e702de848872f93087188fd31fc 100644 --- a/src/operators/kernel/softmax_kernel.h +++ b/src/operators/kernel/softmax_kernel.h @@ -27,7 +27,7 @@ template class SoftmaxKernel : public OpKernelBase> { public: - void Compute(const SoftmaxParam ¶m) const override; + void Compute(const SoftmaxParam ¶m); bool Init(SoftmaxParam *param); }; } // namespace operators diff --git a/src/operators/kernel/split_kernel.h b/src/operators/kernel/split_kernel.h index 03a418de59606e42684c67ca3053fa8e39b07940..3a2c03dce718e650ebf9127044f0db44d9d5c9a5 100644 --- a/src/operators/kernel/split_kernel.h +++ b/src/operators/kernel/split_kernel.h @@ -28,7 +28,7 @@ template class SplitKernel : public framework::OpKernelBase> { public: - void Compute(const SplitParam& param) const; + void Compute(const SplitParam& param); bool Init(SplitParam* param); }; } // namespace operators diff --git a/src/operators/kernel/sum_kernel.h b/src/operators/kernel/sum_kernel.h index ed337432e0fd4bf4035b67d4099379ce29918547..967d6f8307beb90254c431beaf324e891898d1a0 100644 --- a/src/operators/kernel/sum_kernel.h +++ b/src/operators/kernel/sum_kernel.h @@ -25,7 +25,7 @@ template class SumKernel : public framework::OpKernelBase> { public: - void Compute(const SumParam ¶m) const; + void Compute(const SumParam ¶m); bool Init(SumParam *param); }; diff --git a/src/operators/kernel/transpose2_kernel.h b/src/operators/kernel/transpose2_kernel.h index 
8ae75ea483ddb887d9c53b32228ff72b41c76097..a1fb186db09520bed6f891ef9381d96a06f648c9 100644 --- a/src/operators/kernel/transpose2_kernel.h +++ b/src/operators/kernel/transpose2_kernel.h @@ -28,7 +28,7 @@ template class Transpose2Kernel : public framework::OpKernelBase> { public: - void Compute(const Transpose2Param& param) const; + void Compute(const Transpose2Param& param); bool Init(Transpose2Param* param); }; } // namespace operators diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h index 56c41fd221e080a4db3b34fbd4ab208c9986c2a8..63ee6eb172ff691ff51dd3f74613cd3e412210bf 100644 --- a/src/operators/kernel/transpose_kernel.h +++ b/src/operators/kernel/transpose_kernel.h @@ -28,7 +28,7 @@ template class TransposeKernel : public framework::OpKernelBase> { public: - void Compute(const TransposeParam& param) const; + void Compute(const TransposeParam& param); bool Init(TransposeParam* param); }; } // namespace operators diff --git a/src/operators/lookup_op.h b/src/operators/lookup_op.h index 073e884e9157644670259b5acdb47443d2333e03..b5c3886cf46c9641e919aee32e7af30c6528309a 100644 --- a/src/operators/lookup_op.h +++ b/src/operators/lookup_op.h @@ -37,10 +37,6 @@ class LookupOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::LookupKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, LookupParam, - operators::LookupKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h index 26415a84aa96abdab91da7508080ce6a095aca62..3e1e92bfe6d9b888f100d07edaabfe0f8c6eaca5 100644 --- a/src/operators/lrn_op.h +++ b/src/operators/lrn_op.h @@ -35,10 +35,6 @@ class LrnOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::LrnKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, LrnParam, - 
operators::LrnKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index 5cd174db07973461fe699242a2013d9c4ea78732..51e828202e8da2080f014eff2bd60472dd873884 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -35,10 +35,6 @@ class MulOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::MulKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, MulParam, - operators::MulKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 4919ec69b6b5b1a702760f46ddbfc77b16c7875e..059974ab214004bcd1423514c85353da9a9bb6b8 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -40,10 +40,6 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< DeviceType, MultiClassNMSParam, operators::MultiClassNMSKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, - operators::MultiClassNMSKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 2c0075271a92cb66ef95603965dd18d0dd3c5faf..7fe5135f8e233b69231c9e56033ec4f8da49a032 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -27,6 +27,10 @@ limitations under the License. */ #include "fpga/api.h" #endif +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif + namespace paddle_mobile { namespace operators { @@ -48,6 +52,17 @@ struct DtypeTensorTrait { typedef framework::Tensor rtype; }; +#ifdef PADDLE_MOBILE_CL +template <> +struct DtypeTensorTrait { + // This is the type we obtained in variable. + typedef framework::CLImage gtype; + // This type will be the parent class type + // or the same type. 
+ typedef framework::CLImage rtype; +}; +#endif + class OpParam { protected: template @@ -397,6 +412,13 @@ class ConvParam : public OpParam { const int &Groups() const { return groups; } +#ifdef PADDLE_MOBILE_CL + int Offset() const { return offset_; } + + int SetOffset(int in_offset) { offset_ = in_offset; } + +#endif + private: RType *input_; RType *output_; @@ -405,6 +427,10 @@ class ConvParam : public OpParam { vector paddings_; vector dilations_; int groups; + +#ifdef PADDLE_MOBILE_CL + int offset_; +#endif }; template Print &operator<<(Print &printer, const ConvParam &conv_param); @@ -715,6 +741,14 @@ class BatchNormParam : OpParam { const string &DataFormat() const { return data_format_; } + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + private: RType *input_x_; RType *output_y_; @@ -726,6 +760,8 @@ class BatchNormParam : OpParam { float momentum_; bool is_test_; string data_format_; + RType *new_bias_; + RType *new_scale_; }; #endif @@ -1034,18 +1070,18 @@ class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - auto var = scope->Var("batch_size"); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + auto var = scope.FindVar("batch_size"); batch_size = var->GetValue(); } - const GType *InputX() const { return input_x_; } + const LoDTensor *InputX() const { return input_x_; } GType *Out() const { return out_; } const int BatchSize() const { return batch_size; } private: - GType *input_x_; + LoDTensor *input_x_; GType *out_; int batch_size; }; @@ -1059,14 +1095,19 @@ class FetchParam : public OpParam { 
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); } + const RType *InputX() const { return input_x_; } - RType *Out() const { return out_; } + Tensor *Out() const { return out_; } + + static Tensor *OutFrom(const VariableNameMap &outputs, const Scope &scope) { + return GetVarValue("Out", outputs, scope); + } private: RType *input_x_; - RType *out_; + Tensor *out_; }; #ifdef FILL_CONSTANT_OP @@ -1447,13 +1488,13 @@ class ResizeParam : public OpParam { * @b op 层实例化好这个 param 传递给 kernel 层使用 * */ template -class ReluParam : public OpParam { +class ReluParamBase : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: - ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + ReluParamBase(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); } @@ -1466,6 +1507,25 @@ class ReluParam : public OpParam { RType *input_x_; RType *out_; }; + +template +class ReluParam : public ReluParamBase { + public: + using ReluParamBase::ReluParamBase; +}; + +#ifdef PADDLE_MOBILE_CL +template <> +class ReluParam : public ReluParamBase { + public: + using ReluParamBase::ReluParamBase; + framework::CLImage &getMidImage() { return midImage; } + + private: + framework::CLImage midImage; +}; +#endif + #endif #ifdef PRELU_OP @@ -1764,6 +1824,7 @@ class FusionConvAddBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; + #ifdef PADDLE_MOBILE_FPGA private: diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index dd23059ea01a332aff45137b7f7ed4c9f6c2e1bb..241f278ec0c5dd10e103b3ab1aa6f296323eebce 100644 --- 
a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -14,7 +14,8 @@ limitations under the License. */ #ifdef POOL_OP -#include "pool_op.h" +#include "operators/pool_op.h" +#include #include "framework/op_proto_maker.h" #include "framework/op_registry.h" @@ -68,5 +69,8 @@ REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(pool2d, ops::PoolOp); +#endif #endif diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 9880599ce5fc71048d6a555b3fa4848c5d7a8220..8f3957e29ee0802576f604900f8d15f86a864d53 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -38,9 +38,6 @@ class PoolOp : public OperatorWithKernel, : OperatorWithKernel, operators::PoolKernel>( type, inputs, outputs, attrs, scope) {} - using OperatorWithKernel< - DeviceType, PoolParam, - operators::PoolKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/prelu_op.h b/src/operators/prelu_op.h index af33476b7298a5728a6ef944506d55f422a2fa8c..5d0458f896941ece4208ca4b4931db189b4f436e 100644 --- a/src/operators/prelu_op.h +++ b/src/operators/prelu_op.h @@ -38,10 +38,6 @@ class PReluOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::PReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, PReluParam, - operators::PReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index f7e02802ae82368319d5e9095c73afcac295b4fc..f7e26430a0536cde011de14f670a9f46b8f517c1 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -40,9 +40,6 @@ class PriorBoxOp : public framework::OperatorWithKernel< operators::PriorBoxKernel>( type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, 
PriorBoxParam, - operators::PriorBoxKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 933e1cfce064d63664ebc35b7ac331d4f32b74b9..d6d83475ee7879f8bc967439dac2094df12c8617 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -39,5 +39,10 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif +#ifdef PADDLE_MOBILE_FPGA +#endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(relu, ops::ReluOp); +#endif #endif diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 584c9da3c80c4e3e9e69fdb70a602cdd486e26b8..1c94a7f6d71484d0a4bd14e89d8518f6e73a660b 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -41,10 +41,6 @@ class ReluOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ReluParam, - operators::ReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index 214007545844e19cf698c6294416a6501a595b58..8ceb157d28764de469e5de5108ad483387ba8ca9 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -38,5 +38,8 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp); +#endif #endif diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index a7347ddd8c6511224d4422f66eac71e61bf48549..3109303ff0e6007d0dbec133102924ff7bb30306 100644 --- a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -39,10 +39,6 @@ class ReshapeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ReshapeKernel>( type, 
inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ReshapeParam, - operators::ReshapeKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h index c0b38bb1cf4048af4b07d05f28a88a5ac8056ea3..954b3a82f8d2b5ccba242045c3d5e0f28553d484 100644 --- a/src/operators/resize_op.h +++ b/src/operators/resize_op.h @@ -38,10 +38,6 @@ class ResizeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ResizeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ResizeParam, - operators::ResizeKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/scale_op.h b/src/operators/scale_op.h index 4c5f5e620f25bef88533e80cdd78b243fef9bc70..56265259fe3a10feda67cc5c5732b2ba44e0730e 100644 --- a/src/operators/scale_op.h +++ b/src/operators/scale_op.h @@ -38,10 +38,6 @@ class ScaleOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ScaleKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ScaleParam, - operators::ScaleKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/shape_op.h b/src/operators/shape_op.h index 37b4fef1f4667051e51adbd96d6ada36bf36b647..116751c48e9ca3cc9ec936b1bcbaa72b6950bbc5 100644 --- a/src/operators/shape_op.h +++ b/src/operators/shape_op.h @@ -38,10 +38,6 @@ class ShapeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ShapeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ShapeParam, - operators::ShapeKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index 
62fc65dce1025fff629dd81ea4a7f797ded1a1d6..7150a8a473e4cb1dba7230d63799bd263ef19812 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -36,11 +36,6 @@ class SigmoidOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SigmoidKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SigmoidParam, - operators::SigmoidKernel>::OperatorWithKernel; - void InferShape() const override; }; diff --git a/src/operators/slice_op.h b/src/operators/slice_op.h index 6bcb6fa0b9e88cefb3c88dfc096e1073ad261c1b..c45061696577dbe6948fb9cab7edebbaf8e15f2f 100644 --- a/src/operators/slice_op.h +++ b/src/operators/slice_op.h @@ -38,10 +38,6 @@ class SliceOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SliceKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SliceParam, - operators::SliceKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index e85edc69c3291c794f2eeb8119b91b2926c4d870..e605864706a6c59a35205b3072dd432b009c5d1f 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -36,5 +36,8 @@ REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(softmax, ops::SoftmaxOp); +#endif #endif diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index cee5993174a02f610c1de0ad47ca6b73477fd946..422213feeaf2bc2301832de2f9c69827342a5062 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -36,11 +36,6 @@ class SoftmaxOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SoftmaxKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SoftmaxParam, - 
operators::SoftmaxKernel>::OperatorWithKernel; - void InferShape() const override; private: diff --git a/src/operators/split_op.h b/src/operators/split_op.h index d37bf7a0f93005a4c95e7e82c7c90313fda409cb..fc733c18520b971107e00003b3107b8c0aa9b36d 100644 --- a/src/operators/split_op.h +++ b/src/operators/split_op.h @@ -38,10 +38,6 @@ class SplitOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SplitKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SplitParam, - operators::SplitKernel>::OperatorWithKernel; void InferShape() const override; }; } // namespace operators diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index 7e5f72058d4e06f5b5b1fef81ade0350ea78f21c..eb98ce235491632aa1149acc158552955c2c1e0c 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -40,10 +40,6 @@ class TransposeOp : public framework::OperatorWithKernel< DeviceType, TransposeParam, operators::TransposeKernel>(type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, TransposeParam, - operators::TransposeKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2bd7169533f637add2a752feaceca8df132cb262..569b3e54e64af74b9b2de9d84217785f0053f8cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -342,6 +342,13 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) target_link_libraries(test-fssd paddle-mobile) + # gen test + ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) + target_link_libraries(test-mobilenetgpu paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-yologpu paddle-mobile) # gen test ADD_EXECUTABLE(test-multi-process 
net/test_multi_inference_predict.cpp test_helper.h test_include.h) @@ -351,6 +358,5 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) target_link_libraries(test-eng paddle-mobile) - #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif () diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 60f1856bb9294c6f9b4bd5cfb7d44f984c6f0794..970eff2400a1806c4db96cb6112c4d64dfc7eb3b 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include "common/log.h" +#include "framework/executor.h" #include "framework/op_registry.h" -#include "io/executor.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" @@ -29,9 +29,9 @@ limitations under the License. */ #include "operators/softmax_op.h" #include "operators/transpose_op.h" -using paddle_mobile::Executor; using paddle_mobile::framework::BlockDesc; using paddle_mobile::framework::DDim; +using paddle_mobile::framework::Executor; using paddle_mobile::framework::LoDTensor; using paddle_mobile::framework::OpDesc; using paddle_mobile::framework::Program; diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp index 5d1a5828b36b3d9ed371a271af6db82657ff1596..44b9f4971bbd5cc69e1f663ae71e27e69c31a04b 100644 --- a/test/fpga/test_concat_op.cpp +++ b/test/fpga/test_concat_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/concat_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 64fa42658be6b39fabe9bb26296a426949d31197..3d1b6af935b2f3e7f0c60f5c0cbbcc696f6aeba2 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "../test_helper.h" -#include "io/loader.h" +#include "framework/loader.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - // auto program = loader.Load(g_googlenet, true); // auto program = loader.Load(g_mobilenet_ssd, true); - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params", false); + // auto program = loader.Load(std::string(g_ocr) + "/model", + // std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); + return 0; } diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 3cae963eca048da221d69c4c336dd4fdfecbb584..0392020789096e921865afed0b0fc51fa5999c6b 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../test_helper.h" +#include "framework/loader.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io/loader.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_mobilenet_ssd, true); paddle_mobile::framework::ProgramOptimize optimize; diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index f7d29942224b51734cf62988ba8f271f1fa05bc3..527f2067496eac1df1e0fb10d1dfd2ca66fe4cfd 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -29,8 +29,9 @@ int main() { bool optimize = true; auto time1 = time(); if (paddle_mobile.Load(g_googlenet, optimize)) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; std::vector input; std::vector output; std::vector dims{1, 3, 224, 224}; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 4ed7d3b756cfef9554028e1d33f4dd86bf58e4b8..5cce53e866df0530d6c8e1f35bc7159ba6e5ba9b 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -19,14 +19,15 @@ limitations under the License. 
*/ int main() { paddle_mobile::PaddleMobile paddle_mobile; paddle_mobile.SetThreadNum(4); - auto time1 = time(); + auto time1 = paddle_mobile::time(); // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // std::string(g_mobilenet_detect) + "/params", true); auto isok = paddle_mobile.Load(g_mobilenet, true); if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" + << std::endl; std::vector input; std::vector dims{1, 3, 224, 224}; @@ -42,14 +43,14 @@ int main() { for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } - auto time3 = time(); + auto time3 = paddle_mobile::time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } DLOG << vec_result; - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; + auto time4 = paddle_mobile::time(); + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 + << "ms" << std::endl; } std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a5276d6e521855ad81e6b9e2edb58c271ae713d9 --- /dev/null +++ b/test/net/test_mobilenet_GPU.cpp @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + auto time1 = paddle_mobile::time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(std::string(g_mobilenet), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + std::vector vec_result = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" 
+ << std::endl; + return 0; +} diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b00cbef0277f44c65ab951227176721599b0559e --- /dev/null +++ b/test/net/test_yologpu.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + auto time1 = paddle_mobile::time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + // = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + std::cout << "predict 
cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } + return 0; +} diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp index 5f064d27f3f3f9cca5428467557c9412f76735c7..c027d4bd31d5ff41f42e9cd333618f8630aad5d9 100644 --- a/test/operators/test_batchnorm_op.cpp +++ b/test/operators/test_batchnorm_op.cpp @@ -125,7 +125,7 @@ template class TestBatchNormOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run BatchNormOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (4,10,2,2) diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp index aeef10be9650623767af4d2de8913ce53b1d2c59..721e691107c2c2d0117fdedecf219484556c9541 100644 --- a/test/operators/test_box_coder_op.cpp +++ b/test/operators/test_box_coder_op.cpp @@ -114,7 +114,7 @@ template class TestBoxCoderOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run BoxCoderOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); paddle_mobile::framework::Tensor priorbox; diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp index edaa4ce1ddba251886c90262895333b0a56c3a07..1a347a9c37a96f3c31506d0b45f95e05b64292ff 100644 --- a/test/operators/test_concat_op.cpp +++ b/test/operators/test_concat_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/concat_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_conv_add_relu_op.cpp b/test/operators/test_conv_add_relu_op.cpp index 987f52cd62f91b3bc00cc1ef49bd21913e288d75..f170719218b98d341985a61ca6160884afe4ad3b 100644 --- a/test/operators/test_conv_add_relu_op.cpp +++ b/test/operators/test_conv_add_relu_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/fusion_conv_add_relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_googlenet, true); diff --git a/test/operators/test_cov_op.cpp b/test/operators/test_cov_op.cpp index a85ad9edba5d3e2256b8d7ee7d7d3c5b7200888d..535d82c4be6cedcc77e9e9cf97a9a813f4ca518d 100644 --- a/test/operators/test_cov_op.cpp +++ b/test/operators/test_cov_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/conv_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_googlenet); diff --git a/test/operators/test_depthwise_conv_op.cpp b/test/operators/test_depthwise_conv_op.cpp index bd2aad19eda896bad3da8a47f5b70b1a923dc1a7..77c76eedc5690412dfee95dd11e8a3fe9ed6ecbe 100644 --- a/test/operators/test_depthwise_conv_op.cpp +++ b/test/operators/test_depthwise_conv_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/depthwise_conv_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_mobilenet_ssd); diff --git a/test/operators/test_elementwise_add_op.cpp b/test/operators/test_elementwise_add_op.cpp index 0a5e9f7e92701e748df51078b21eb46eec90599d..3922b216cfc6ecf55be251ded02c0c064e2c3ffc 100644 --- a/test/operators/test_elementwise_add_op.cpp +++ b/test/operators/test_elementwise_add_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp index e27361b21c3146675ea856d02d70878e73e8912f..e1030852976a68db827ebb7629caf8bb199a2456 100644 --- a/test/operators/test_elementwise_sub_op.cpp +++ b/test/operators/test_elementwise_sub_op.cpp @@ -104,7 +104,7 @@ template class TestElementwiseSubOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run ElementwiseSub Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_fill_constant_op.cpp b/test/operators/test_fill_constant_op.cpp index 99c65ed821c0a90691070b661a6967a11d4694f7..9dc7bb13884efb8860a6670e088bd5af67c1f0ea 100644 --- a/test/operators/test_fill_constant_op.cpp +++ b/test/operators/test_fill_constant_op.cpp @@ -94,7 +94,7 @@ template class TestFillConstantOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run FillConstant Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + 
"/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp index 7764d95ed72da613459233bd55ddcffdc444318f..347bcb40a6156a576842af34920bde838dd83cd8 100644 --- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "operators/fusion_conv_add_bn_relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_mobilenet, true); diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp index aaa2d7b578dbda4c6919210eb4a2fb42ba243e53..a8ec4883aab4218aa526e7b90267998754d1eb30 100644 --- a/test/operators/test_fusion_fc_op.cpp +++ b/test/operators/test_fusion_fc_op.cpp @@ -112,7 +112,7 @@ template class TestFcOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Fc Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; diff --git a/test/operators/test_gru_op.cpp b/test/operators/test_gru_op.cpp index 52ab8b54d709391ea263b74a395a635ce50a18af..f2ce833661bfd1b3d751a7ac2d54cfb70114a6c6 100644 --- a/test/operators/test_gru_op.cpp +++ b/test/operators/test_gru_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/gru_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_nlp); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp index 6c69d1cc9d94ffd958251ee4ed783d6b5531c455..3cd172d99bb1bb9c24f035d501dce362476909c2 100644 --- a/test/operators/test_im2sequence_op.cpp +++ b/test/operators/test_im2sequence_op.cpp @@ -60,7 +60,6 @@ class TestIm2SequenceOp { Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1"); auto tensor_x1 = x1_feed_value->GetMutable(); tensor_x1->ShareDataWith(t1); - Variable *output = scope->Var("im2sequence_0.tmp_0"); auto *output_tensor = output->GetMutable(); output_tensor->mutable_data({2, 12}); @@ -100,7 +99,7 @@ template class TestIm2SequenceOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Im2Sequence Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_eng) + "/model", std::string(g_eng) + "/params"); diff --git a/test/operators/test_lrn_op.cpp b/test/operators/test_lrn_op.cpp index d4d9f8da802fc0f5f885a3b2e81cba695776c29e..5d1ac9b4dd7225112ace8bfbb13f926502c77b94 100644 --- a/test/operators/test_lrn_op.cpp +++ b/test/operators/test_lrn_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/lrn_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp index 3447bbdd10b64d2c2f497bdb4d5af15958a9a95b..32c2c1f6bd682fdac8d9b81155b8aa044b87232b 100644 --- a/test/operators/test_multiclass_nms_op.cpp +++ b/test/operators/test_multiclass_nms_op.cpp @@ -111,9 +111,8 @@ template class TestMultiClassNMSOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run MulticlassNMS Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); - paddle_mobile::framework::Tensor inputx1; SetupTensor(&inputx1, {1, 2, 4}, static_cast(0), static_cast(1)); diff --git a/test/operators/test_polygon_box_transform_op.cpp b/test/operators/test_polygon_box_transform_op.cpp index 5b30ce1ebfd59db972953e16e4506fa2595b8f04..2347f06989153b9ce5994fa0e4d09673ab2698f1 100644 --- a/test/operators/test_polygon_box_transform_op.cpp +++ b/test/operators/test_polygon_box_transform_op.cpp @@ -96,7 +96,7 @@ template class TestPolygonBoxTransformOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run PolygonBoxTransform Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr)); paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 2daecd7b4c1a50c612bc784c801208d2e6f31482..09470caf82eb90df56f7aa79b6873c2a6b94fbef 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/pool_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_googlenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_prelu_op.cpp b/test/operators/test_prelu_op.cpp index e93d8732d18496721b24cfba1df296250169f8b2..f98c9904ae3799cb863142b0fcb332c74c91ba98 100644 --- a/test/operators/test_prelu_op.cpp +++ b/test/operators/test_prelu_op.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "operators/prelu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp index 2c75d01df297030b4633829ac4b29f7592aaf5c4..424f2443f8627002cff0adc19600f9aba50ad0fb 100644 --- a/test/operators/test_prior_box_op.cpp +++ b/test/operators/test_prior_box_op.cpp @@ -125,7 +125,7 @@ template class TestPriorBoxOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run PriorBoxOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (1,3,300,300) diff --git a/test/operators/test_relu_op.cpp b/test/operators/test_relu_op.cpp index fad0d0c30a126cc2730e4aa8b87364eee9fc8209..542d3d18f6a383c1e03962ba845b39c04a51631b 100644 --- a/test/operators/test_relu_op.cpp +++ b/test/operators/test_relu_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_reshape2_op.cpp b/test/operators/test_reshape2_op.cpp index 42c348a6274592eb23332620131faa0784a71d28..d0d51f984a617ea37713e5830adf6b5d248fb434 100644 --- a/test/operators/test_reshape2_op.cpp +++ b/test/operators/test_reshape2_op.cpp @@ -112,7 +112,7 @@ template class TestReshape2Op; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Reshape2 Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index 3541151d8a1a286527e715f402df381d2efc094c..ff3299f5e818d8169a356323213707417d747dba 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/reshape_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_resize_op.cpp b/test/operators/test_resize_op.cpp index f4dcaa6885d92a727e8c97d5106c3b6913a4ab33..c452ef8d850f97f6988688c4e47d5041220cb828 100644 --- a/test/operators/test_resize_op.cpp +++ b/test/operators/test_resize_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/resize_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 739c594ad7044025eaa3637d8669c43f1c6c6348..df93da1529ae1e03561643ebeef4cb821f10d211 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h" #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io/executor.h" +#include "framework/executor.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index a0184729a8bc5e6b0ba952923eecd5242cfe36d4..f31bcb4e455a6b9699cf96271310681e51d4c6a7 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "operators/softmax_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp index 467529d8d3877fcb9ac5527daf5f037aea6d18fc..9cabf1212525a7d4d6f36c45f81cba438694843d 100644 --- a/test/operators/test_sum_op.cpp +++ b/test/operators/test_sum_op.cpp @@ -103,7 +103,7 @@ template class TestSumOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Sum Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_eng) + "/model", std::string(g_eng) + "/params"); diff --git a/test/operators/test_transpose2_op.cpp b/test/operators/test_transpose2_op.cpp index b75a957cd5c1cd08dc09895e9e2448761e822274..5da0faaf119c553e2fb019de76bb40f875f9d673 100644 --- a/test/operators/test_transpose2_op.cpp +++ b/test/operators/test_transpose2_op.cpp @@ -113,7 +113,7 @@ template class TestTranspose2Op; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Transpose2 Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index f83ee23c25d8f2588e0fe40d5fabc6114129b995..263fdcfa0ed448b126f4b9cb01ace889318eeddb 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "../test_include.h" #include "operators/transpose_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/test_helper.h b/test/test_helper.h index 41d6faed5229be8944178ea62786477ceadd6416..0eb11efd19b7d937f93eec14e163c8c42cb77f12 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -36,6 +36,7 @@ static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_mobilenet_mul = "../models/mobilenet_mul"; static const char *g_alexnet = "../models/alexnet"; static const char *g_inceptionv4 = "../models/inceptionv4"; static const char *g_nlp = "../models/nlp"; @@ -44,8 +45,8 @@ static const char *g_resnet = "../models/resnet"; static const char *g_googlenet_combine = "../models/googlenet_combine"; static const char *g_yolo = "../models/yolo"; static const char *g_yolo_combined = "../models/yolo_combined"; +static const char *g_yolo_mul = "../models/yolo_mul"; static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; - static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; static const char *g_test_image_1x3x224x224_banana = @@ -57,9 +58,12 @@ static const char *g_moto = "../images/moto_300x300_float"; static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; static const char *g_img = "../images/img.bin"; +static const char *g_yolo_img = "../images/in_put_1_3_416_416_2"; +static const char *g_mobilenet_img = "../images/image"; using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; +using namespace paddle_mobile; template void 
SetupTensor(paddle_mobile::framework::Tensor *input, diff --git a/third_party/opencl/OpenCL-Headers/CL/cl.h b/third_party/opencl/OpenCL-Headers/CL/cl.h new file mode 100644 index 0000000000000000000000000000000000000000..7224ed38faad33d8ed9c25acaeee26400c716aa6 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl.h @@ -0,0 +1,1783 @@ +/******************************************************************************* + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#include +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_device_svm_capabilities; +#endif +typedef cl_bitfield cl_command_queue_properties; +#ifdef CL_VERSION_1_2 +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; +#endif + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_queue_properties; +#endif +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_svm_mem_flags; +#endif +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +#ifdef CL_VERSION_1_2 +typedef cl_bitfield cl_mem_migration_flags; +#endif +typedef cl_uint cl_image_info; +#ifdef 
CL_VERSION_1_1 +typedef cl_uint cl_buffer_create_type; +#endif +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +#ifdef CL_VERSION_2_0 +typedef intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +#endif +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_program_binary_type; +#endif +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +#endif +typedef cl_uint cl_kernel_work_group_info; +#ifdef CL_VERSION_2_1 +typedef cl_uint cl_kernel_sub_group_info; +#endif +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; +#endif + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +#ifdef CL_VERSION_1_2 + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif + union { + cl_mem buffer; + cl_mem mem_object; + }; +} cl_image_desc; + +#endif + +#ifdef CL_VERSION_1_1 + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +#endif + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define 
CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#ifdef CL_VERSION_1_1 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 +#endif + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#ifdef 
CL_VERSION_1_1 +#define CL_INVALID_PROPERTY -64 +#endif +#ifdef CL_VERSION_1_2 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#endif +#ifdef CL_VERSION_2_0 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 +#endif +#ifdef CL_VERSION_2_2 +#define CL_INVALID_SPEC_ID -71 +#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 +#endif + + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#ifdef CL_VERSION_1_2 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE +#endif + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 +#ifdef CL_VERSION_2_1 +#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 +#endif + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#endif +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F 
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#endif +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ +#ifdef CL_VERSION_1_1 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define 
CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#endif +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#endif +#ifdef CL_VERSION_2_1 +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D +#endif + +/* 
cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#ifdef CL_VERSION_1_1 +#define CL_FP_SOFT_FLOAT (1 << 6) +#endif +#ifdef CL_VERSION_1_2 +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) +#endif + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) +#endif + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#ifdef CL_VERSION_1_1 +#define CL_CONTEXT_NUM_DEVICES 0x1083 +#endif + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#ifdef CL_VERSION_1_2 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +#endif + 
+#ifdef CL_VERSION_2_0 + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +#endif + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_SIZE 0x1094 +#endif +#ifdef CL_VERSION_2_1 +#define CL_QUEUE_DEVICE_DEFAULT 0x1095 +#endif + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#ifdef CL_VERSION_1_2 +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +#endif + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#ifdef CL_VERSION_1_1 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#endif +#ifdef CL_VERSION_2_0 +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define 
CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 +#endif + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#ifdef CL_VERSION_1_2 +#define CL_UNORM_INT24 0x10DF +#endif +#ifdef CL_VERSION_2_1 +#define CL_UNORM_INT_101010_2 0x10E0 +#endif + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#ifdef CL_VERSION_1_2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_OBJECT_PIPE 0x10F7 +#endif + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#ifdef CL_VERSION_1_1 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_USES_SVM_POINTER 0x1109 +#endif + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#ifdef CL_VERSION_1_2 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_pipe_info */ +#define 
CL_PIPE_PACKET_SIZE 0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 + +#endif + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#ifdef CL_VERSION_1_1 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 +#endif + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +#ifdef CL_VERSION_2_0 +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 +#endif + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#ifdef CL_VERSION_1_2 +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) +#endif + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#endif +#ifdef CL_VERSION_2_1 +#define CL_PROGRAM_IL 0x1169 +#endif +#ifdef CL_VERSION_2_2 +#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A +#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B +#endif + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#endif +#ifdef CL_VERSION_2_0 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define 
CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +#endif + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#endif +#ifdef CL_VERSION_2_1 +#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_type_qualifier */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#ifdef CL_VERSION_2_0 +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) +#endif + +#endif + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define 
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 +#endif + +#ifdef CL_VERSION_2_1 + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 +#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 + +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +#endif + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#ifdef CL_VERSION_1_1 +#define CL_EVENT_CONTEXT 0x11D4 +#endif + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#ifdef CL_VERSION_1_1 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#endif +#ifdef CL_VERSION_2_0 +#define CL_COMMAND_SVM_FREE 0x1209 +#define 
CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D +#endif + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +#ifdef CL_VERSION_1_1 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +#endif + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#ifdef CL_VERSION_2_0 +#define CL_PROFILING_COMMAND_COMPLETE 0x1284 +#endif + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint 
* /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetDefaultDeviceCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceAndHostTimer(cl_device_id /* device */, + cl_ulong* /* device_timestamp */, + cl_ulong* /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetHostTimer(cl_device_id /* device */, + cl_ulong * /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret 
*/) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context /* context */, + cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, + cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* 
errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem /* pipe */, + cl_pipe_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* SVM Allocation APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context /* context */, + cl_svm_mem_flags /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +/* Sampler APIs 
*/ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSamplerWithProperties(cl_context /* context */, + const cl_sampler_properties * /* normalized_coords */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context /* context */, + const void* /* il */, + size_t /* length */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramReleaseCallback(cl_program /* program */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_2_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramSpecializationConstant(cl_program /* program */, + cl_uint /* spec_id */, + size_t /* spec_size */, + const void* /* spec_value */) CL_API_SUFFIX__VERSION_2_2; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) 
CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCloneKernel(cl_kernel /* source_kernel */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel /* kernel */, + cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) 
CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_sub_group_info /* param_name */, + size_t /* input_value_size */, + const void* /*input_value */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */ ) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event 
* /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + 
const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + 
+#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * 
/*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMigrateMem(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + const void ** /* svm_pointers */, + const size_t * /* sizes */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_2_1; + +#endif + +#ifdef CL_VERSION_1_2 + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + /* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. 
+ */ + extern CL_API_ENTRY cl_int CL_API_CALL + clSetCommandQueueProperty(cl_command_queue /* command_queue */, + cl_command_queue_properties /* properties */, + cl_bool /* enable */, + cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY 
CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* Deprecated OpenCL 2.0 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h new file mode 100644 index 0000000000000000000000000000000000000000..d5960a43f72123bdd693da50d3ad9a3a82cd032c --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include <d3d10.h> +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h new file mode 100644 index 0000000000000000000000000000000000000000..39f9072398a29ab0c5a91f3a08b8c75034e8ac17 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D11_H +#define __OPENCL_CL_D3D11_H + +#include <d3d11.h> +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d11_sharing */ +#define cl_khr_d3d11_sharing 1 + +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 + +/* cl_d3d11_device_source */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A + +/* cl_d3d11_device_set */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C + +/* cl_context_info */ +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D + +/* cl_mem_info */ +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E + +/* cl_image_info */ +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D11Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D11_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h new file mode 100644 index 0000000000000000000000000000000000000000..2729e8b9e89a10dc410863140a904ee67250950d --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h @@ -0,0 +1,132 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_H + +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* cl_khr_dx9_media_sharing */ +#define cl_khr_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_media_adapter_type_khr; +typedef cl_uint cl_dx9_media_adapter_set_khr; + +#if defined(_WIN32) +#include <d3d9.h> +typedef struct _cl_dx9_surface_info_khr +{ + IDirect3DSurface9 *resource; + HANDLE shared_handle; +} cl_dx9_surface_info_khr; +#endif + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 + +/* cl_media_adapter_type_khr */ +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 + +/* cl_media_adapter_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 + +/* cl_context_info */ +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 + +/* cl_mem_info */ +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 + +/* cl_image_info */ +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr * media_adapter_type, + void * media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( + cl_context context, + cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, + void * surface_info, + cl_uint plane, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..331bab97c74050724573be927774523fb24101df --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h @@ -0,0 +1,182 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_dx9_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*************************************** +* cl_intel_dx9_media_sharing extension * +****************************************/ + +#define cl_intel_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_device_source_intel; +typedef cl_uint cl_dx9_device_set_intel; + +/* error codes */ +#define CL_INVALID_DX9_DEVICE_INTEL -1010 +#define CL_INVALID_DX9_RESOURCE_INTEL -1011 +#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 +#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 + +/* cl_dx9_device_source_intel */ +#define CL_D3D9_DEVICE_INTEL 0x4022 +#define CL_D3D9EX_DEVICE_INTEL 0x4070 +#define CL_DXVA_DEVICE_INTEL 0x4071 + +/* cl_dx9_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 +#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 + +/* cl_context_info */ +#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 +#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 +#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 + +/* cl_mem_info */ +#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 +#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 + +/* cl_image_info */ +#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 + +/* cl_command_type */ +#define 
CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A +#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B +/******************************************************************************/ + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromDX9INTEL( + cl_platform_id /* platform */, + cl_dx9_device_source_intel /* dx9_device_source */, + void* /* dx9_object */, + cl_dx9_device_set_intel /* dx9_device_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( + cl_platform_id /* platform */, + cl_dx9_device_source_intel /* dx9_device_source */, + void* /* dx9_object */, + cl_dx9_device_set_intel /* dx9_device_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromDX9MediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + IDirect3DSurface9* /* resource */, + HANDLE /* sharedHandle */, + UINT /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + IDirect3DSurface9* /* resource */, + HANDLE /* sharedHandle */, + UINT /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireDX9ObjectsINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const 
cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseDX9ObjectsINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_egl.h b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h new file mode 100644 index 0000000000000000000000000000000000000000..a765bd5266c02fc2fd2892f0257b228996d73c5f --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h @@ -0,0 +1,136 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#ifdef __APPLE__ + +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context /* context */, + CLeglDisplayKHR /* egldisplay */, + CLeglImageKHR /* eglimage */, + cl_mem_flags /* flags */, + const cl_egl_image_properties_khr * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL 
*clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context /* context */, + CLeglSyncKHR /* sync */, + CLeglDisplayKHR /* display */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_ext.h b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h new file mode 100644 index 
0000000000000000000000000000000000000000..af3ce461f3a48e7707caca966e704dfe5eb58e30 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h @@ -0,0 +1,723 @@ +/******************************************************************************* + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. 
*/ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include + #include +#else + #include +#endif + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ + +#if CL_TARGET_OPENCL_VERSION <= 110 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. 
+ */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). + * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + + 
+/******************************* + * cl_khr_il_program extension * + *******************************/ +#define cl_khr_il_program 1 + +/* New property to clGetDeviceInfo for retrieving supported intermediate + * languages + */ +#define CL_DEVICE_IL_VERSION_KHR 0x105B + +/* New property to clGetProgramInfo for retrieving for retrieving the IL of a + * program + */ +#define CL_PROGRAM_IL_KHR 0x1169 + +extern CL_API_ENTRY cl_program + CL_API_CALL clCreateProgramWithILKHR( + cl_context /* context */, + const void * /* il */, + size_t /* length */, + cl_int * /* errcode_ret */); + +typedef CL_API_ENTRY cl_program + (CL_API_CALL *clCreateProgramWithILKHR_fn)( + cl_context /* context */, + const void * /* il */, + size_t /* length */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +/* Extension: cl_khr_image2D_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without a copy. + * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t. + * Both the sampler and sampler-less read_image built-in functions are supported for 2D images + * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported + * for 2D images created from a buffer. + * + * When the 2D image from buffer is created, the client must specify the width, + * height, image format (i.e. channel order and channel data type) and optionally the row pitch + * + * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels. + * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels. 
+ */ + +/************************************** + * cl_khr_initialize_memory extension * + **************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/***************************************** + * cl_khr_create_command_queue extension * + *****************************************/ +#define cl_khr_create_command_queue 1 + +typedef cl_bitfield cl_queue_properties_khr; + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithPropertiesKHR( cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties_khr* /* properties */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; +typedef CL_API_ENTRY cl_command_queue +(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties_khr* /* properties */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ + +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define 
CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ + +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + + +/********************************* +* cl_arm_printf extension +*********************************/ + +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + + +/*********************************** +* cl_ext_device_fission extension +***********************************/ +#define cl_ext_device_fission 1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef cl_ulong cl_device_partition_property_ext; +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +/* cl_device_partition_property_ext */ +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 
+#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +/* clDeviceGetInfo selectors */ +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + +/* error codes */ +#define CL_DEVICE_PARTITION_FAILED_EXT -1057 +#define CL_INVALID_PARTITION_COUNT_EXT -1058 +#define CL_INVALID_PARTITION_NAME_EXT -1059 + +/* CL_AFFINITY_DOMAINs */ +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_device_partition_property_ext list terminators */ +#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + + +/*********************************** + * cl_ext_migrate_memobject extension definitions + ***********************************/ +#define cl_ext_migrate_memobject 1 + +typedef cl_bitfield cl_mem_migration_flags_ext; + +#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 + +#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjectEXT( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags_ext /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */ ); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + 
cl_mem_migration_flags_ext /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */ ); + + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ +#define cl_qcom_ext_host_ptr 1 + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + + +/******************************************* +* cl_qcom_ext_host_ptr_iocoherent extension +********************************************/ + +/* Cache policy specifying io-coherence */ +#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 + + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. 
*/ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* Virtual pointer to the android native buffer */ + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + + +/****************************************** + * cl_img_yuv_image extension * + ******************************************/ + +/* Image formats used in clCreateImage */ +#define CL_NV21_IMG 0x40D0 +#define CL_YV12_IMG 0x40D1 + + +/****************************************** + * cl_img_cached_allocations extension * + ******************************************/ + +/* Flag values used by clCreteBuffer */ +#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) +#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) + + +/****************************************** + * cl_img_use_gralloc_ptr extension * + ******************************************/ +#define cl_img_use_gralloc_ptr 1 + +/* Flag values used by clCreteBuffer */ +#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 +#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 + +/* Error code from clEnqueueReleaseGrallocObjectsIMG */ +#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* 
event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + + +/********************************* +* cl_khr_subgroups extension +*********************************/ +#define cl_khr_subgroups 1 + +#if !defined(CL_VERSION_2_1) +/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. + In hindsight, there should have been a khr suffix on this type for + the extension, but keeping it un-suffixed to maintain backwards + compatibility. */ +typedef cl_uint cl_kernel_sub_group_info; +#endif + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + + +/********************************* +* cl_khr_priority_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. 
*/ +#define cl_khr_priority_hints 1 + +typedef cl_uint cl_queue_priority_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_PRIORITY_KHR 0x1096 + +/* cl_queue_priority_khr */ +#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) +#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) +#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_throttle_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_throttle_hints 1 + +typedef cl_uint cl_queue_throttle_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_THROTTLE_KHR 0x1097 + +/* cl_queue_throttle_khr */ +#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) +#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) +#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_subgroup_named_barrier +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_subgroup_named_barrier 1 + +/* cl_device_info */ +#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 + + +/********************************** + * cl_arm_import_memory extension * + **********************************/ +#define cl_arm_import_memory 1 + +typedef intptr_t cl_import_properties_arm; + +/* Default and valid proporties name for cl_arm_import_memory */ +#define CL_IMPORT_TYPE_ARM 0x40B2 + +/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 + +/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 + +/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_SECURE_ARM 0x40B5 + +/* This extension adds a new function that allows for direct memory import into + * OpenCL via the clImportMemoryARM function. 
+ * + * Memory imported through this interface will be mapped into the device's page + * tables directly, providing zero copy access. It will never fall back to copy + * operations and aliased buffers. + * + * Types of memory supported for import are specified as additional extension + * strings. + * + * This extension produces cl_mem allocations which are compatible with all other + * users of cl_mem in the standard API. + * + * This extension maps pages with the same properties as the normal buffer creation + * function clCreateBuffer. + */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clImportMemoryARM( cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; + + +/****************************************** + * cl_arm_shared_virtual_memory extension * + ******************************************/ +#define cl_arm_shared_virtual_memory 1 + +/* Used by clGetDeviceInfo */ +#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 + +/* Used by clGetMemObjectInfo */ +#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 + +/* Used by clSetKernelExecInfoARM: */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_SVM_FREE_ARM 0x40BA +#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB +#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC +#define CL_COMMAND_SVM_MAP_ARM 0x40BD +#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE + +/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
*/ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) + +/* Flag values used by clSVMAllocARM: */ +#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) +#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) + +typedef cl_bitfield cl_svm_mem_flags_arm; +typedef cl_uint cl_kernel_exec_info_arm; +typedef cl_bitfield cl_device_svm_capabilities_arm; + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAllocARM(cl_context /* context */, + cl_svm_mem_flags_arm /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFreeARM(cl_context /* context */, + void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFreeARM(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMapARM(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointerARM(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2; +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfoARM(cl_kernel /* kernel */, + cl_kernel_exec_info_arm /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..1c358cfc10c5c01fa5b5bfcc65d4e5904f830a9e --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h @@ -0,0 +1,429 @@ +/******************************************************************************* + * Copyright (c) 2008-2017 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2017 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_ext_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + +#ifndef __CL_EXT_INTEL_H +#define __CL_EXT_INTEL_H + +#ifdef __APPLE__ + #include + #include +#else + #include + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*************************************** +* cl_intel_thread_local_exec extension * +****************************************/ + +#define cl_intel_thread_local_exec 1 + +#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) + +/*********************************************** +* cl_intel_device_partition_by_names extension * +************************************************/ + +#define cl_intel_device_partition_by_names 1 + +#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 +#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 + +/************************************************ +* cl_intel_accelerator extension * +* cl_intel_motion_estimation extension * +* cl_intel_advanced_motion_estimation extension * +*************************************************/ + +#define cl_intel_accelerator 1 +#define cl_intel_motion_estimation 1 +#define cl_intel_advanced_motion_estimation 1 + +typedef struct _cl_accelerator_intel* cl_accelerator_intel; +typedef cl_uint cl_accelerator_type_intel; +typedef cl_uint cl_accelerator_info_intel; + +typedef struct _cl_motion_estimation_desc_intel { + cl_uint mb_block_type; + cl_uint subpixel_mode; + cl_uint 
sad_adjust_mode; + cl_uint search_path_type; +} cl_motion_estimation_desc_intel; + +/* error codes */ +#define CL_INVALID_ACCELERATOR_INTEL -1094 +#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 +#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 +#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 + +/* cl_accelerator_type_intel */ +#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 + +/* cl_accelerator_info_intel */ +#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 +#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 +#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 +#define CL_ACCELERATOR_TYPE_INTEL 0x4093 + +/* cl_motion_detect_desc_intel flags */ +#define CL_ME_MB_TYPE_16x16_INTEL 0x0 +#define CL_ME_MB_TYPE_8x8_INTEL 0x1 +#define CL_ME_MB_TYPE_4x4_INTEL 0x2 + +#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 + +#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 + +#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 +#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 +#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 + +#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 +#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 +#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 +#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 + +#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 +#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 +#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 + +#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 +#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 +#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 +#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 +#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 + +#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 +#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 +#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 +#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 + +#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define 
CL_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 + +#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +/* cl_device_info */ +#define CL_DEVICE_ME_VERSION_INTEL 0x407E + +#define CL_ME_VERSION_LEGACY_INTEL 0x0 +#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 +#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 + +extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL +clCreateAcceleratorINTEL( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetAcceleratorInfoINTEL( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clGetAcceleratorInfoINTEL_fn)( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainAcceleratorINTEL( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseAcceleratorINTEL( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +/****************************************** +* cl_intel_simultaneous_sharing extension * +*******************************************/ + +#define cl_intel_simultaneous_sharing 1 + +#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 +#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 + +/*********************************** +* cl_intel_egl_image_yuv extension * +************************************/ + +#define cl_intel_egl_image_yuv 1 + +#define CL_EGL_YUV_PLANE_INTEL 0x4107 + +/******************************** +* cl_intel_packed_yuv extension * +*********************************/ + +#define cl_intel_packed_yuv 1 + +#define CL_YUYV_INTEL 0x4076 +#define CL_UYVY_INTEL 0x4077 +#define CL_YVYU_INTEL 0x4078 +#define CL_VYUY_INTEL 0x4079 + +/******************************************** +* cl_intel_required_subgroup_size extension * +*********************************************/ + +#define cl_intel_required_subgroup_size 1 + +#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 +#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 +#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A + +/**************************************** +* cl_intel_driver_diagnostics extension * 
+*****************************************/ + +#define cl_intel_driver_diagnostics 1 + +typedef cl_uint cl_diagnostics_verbose_level; + +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 + +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) + +/******************************** +* cl_intel_planar_yuv extension * +*********************************/ + +#define CL_NV12_INTEL 0x410E + +#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) +#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) + +#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E +#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F + +/******************************************************* +* cl_intel_device_side_avc_motion_estimation extension * +********************************************************/ + +#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B +#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C +#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D + +#define CL_AVC_ME_VERSION_0_INTEL 0x0; // No support. +#define CL_AVC_ME_VERSION_1_INTEL 0x1; // First supported version. 
+ +#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 + +#define CL_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CL_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CL_AVC_ME_MINOR_4x8_INTEL 0x2 +#define CL_AVC_ME_MINOR_4x4_INTEL 0x3 + +#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 +#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 +#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9 +#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 +#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa + +#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define 
CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 +#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 + +#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) + +#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 +#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +#define CL_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CL_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CL_AVC_ME_INTRA_4x4_INTEL 0x2 + +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define 
CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 + +#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_EXT_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_gl.h b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h new file mode 100644 index 0000000000000000000000000000000000000000..58b6449f9b4e98d561ee9a6f8b3daa6caede9f44 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h @@ -0,0 +1,175 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#ifdef CL_VERSION_1_2 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 +#endif + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#ifdef CL_VERSION_1_2 +#define CL_GL_NUM_SAMPLES 0x2012 +#endif + + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* 
param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..e3c14c6408c44160103bcb4c0dcd230a674643a5 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h @@ -0,0 +1,74 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include +#else + #include +#endif + +/* + * For each extension, follow this template + * cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ... define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. 
+ */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_platform.h b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..c2f408fed59fc42f9c2573061704610498890b40 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h @@ -0,0 +1,1460 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + #include + + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include +#else + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +/* + * Deprecation flags refer to the last version of the header in which the + * feature was not deprecated. + * + * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without + * deprecation but is deprecated in versions later than 1.1. 
+ */ + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + #define CL_API_SUFFIX__VERSION_2_1 + #define CL_EXT_SUFFIX__VERSION_2_1 + #define CL_API_SUFFIX__VERSION_2_2 + #define CL_EXT_SUFFIX__VERSION_2_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated)) + #define 
CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif + #elif defined(_WIN32) + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define 
CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 
+#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623158e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float
__attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 
2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). 
The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + 
typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif 
+ #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >= 1500 + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ + #define __CL_HAS_ANON_STRUCT__ 1 + #define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #endif +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif 
+}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ 
cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + 
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( 
__CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( 
__CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. 
+ */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >=1500 + #pragma warning( pop ) + #endif +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..28444288573219be06fa449bb50161a20e95acfc --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h @@ -0,0 +1,172 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +File Name: cl_va_api_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + + +#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************** +* cl_intel_va_api_media_sharing extension * +*******************************************/ + +#define cl_intel_va_api_media_sharing 1 + +/* error codes */ +#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 +#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 +#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 +#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 + +/* cl_va_api_device_source_intel */ +#define CL_VA_API_DISPLAY_INTEL 0x4094 + +/* cl_va_api_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 +#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 + +/* cl_context_info */ +#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 + +/* cl_mem_info */ +#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 + +/* cl_image_info */ +#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A +#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B + +typedef cl_uint cl_va_api_device_source_intel; +typedef cl_uint cl_va_api_device_set_intel; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromVA_APIMediaAdapterINTEL( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + 
void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromVA_APIMediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) 
CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_version.h b/third_party/opencl/OpenCL-Headers/CL/cl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..bb766cb9bbddca65a3cd599375a24cb827789d08 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_version.h @@ -0,0 +1,86 @@ +/******************************************************************************* + * Copyright (c) 2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __CL_VERSION_H +#define __CL_VERSION_H + +/* Detect which version to target */ +#if !defined(CL_TARGET_OPENCL_VERSION) +#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)") +#define CL_TARGET_OPENCL_VERSION 220 +#endif +#if CL_TARGET_OPENCL_VERSION != 100 && \ + CL_TARGET_OPENCL_VERSION != 110 && \ + CL_TARGET_OPENCL_VERSION != 120 && \ + CL_TARGET_OPENCL_VERSION != 200 && \ + CL_TARGET_OPENCL_VERSION != 210 && \ + CL_TARGET_OPENCL_VERSION != 220 +#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220). Defaulting to 220 (OpenCL 2.2)") +#undef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 220 +#endif + + +/* OpenCL Version */ +#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) +#define CL_VERSION_2_2 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) +#define CL_VERSION_2_1 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) +#define CL_VERSION_2_0 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) +#define CL_VERSION_1_2 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) +#define CL_VERSION_1_1 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) +#define CL_VERSION_1_0 1 +#endif + +/* Allow deprecated APIs for older OpenCL versions. 
*/ +#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) +#define CL_USE_DEPRECATED_OPENCL_2_1_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) +#define CL_USE_DEPRECATED_OPENCL_2_0_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_0_APIS +#endif + +#endif /* __CL_VERSION_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/opencl.h b/third_party/opencl/OpenCL-Headers/CL/opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..9855cd75e7da064e094658b660851997c38a8c56 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/opencl.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/LICENSE b/third_party/opencl/OpenCL-Headers/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..020ce65fcac2a60e44dab1626fa4924dec17ea23 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2008-2015 The Khronos Group Inc. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and/or associated documentation files (the +"Materials"), to deal in the Materials without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Materials, and to +permit persons to whom the Materials are furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Materials. 
+ +MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + https://www.khronos.org/registry/ + +THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. diff --git a/third_party/opencl/OpenCL-Headers/README.md b/third_party/opencl/OpenCL-Headers/README.md new file mode 100644 index 0000000000000000000000000000000000000000..757e56e152f8bc2fed68d2cdf38164c3171f929d --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/README.md @@ -0,0 +1,50 @@ +# OpenCLTM API Headers + +This repository contains C language headers for the OpenCL API. + +The authoritative public repository for these headers is located at: + +https://github.com/KhronosGroup/OpenCL-Headers + +Issues, proposed fixes for issues, and other suggested changes should be +created using Github. + +## Branch Structure + +The OpenCL API headers in this repository are Unified headers and are designed +to work with all released OpenCL versions. This differs from previous OpenCL +API headers, where version-specific API headers either existed in separate +branches, or in separate folders in a branch. + +## Compiling for a Specific OpenCL Version + +By default, the OpenCL API headers in this repository are for the latest +OpenCL version (currently OpenCL 2.2). To use these API headers to target +a different OpenCL version, an application may `#define` the preprocessor +value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers. 
+The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing +the OpenCL API version. + +For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may +include the OpenCL API headers as follows: + +``` +#define CL_TARGET_OPENCL_VERSION 120 +#include +``` + +## Directory Structure + +``` +README.md This file +LICENSE Source license for the OpenCL API headers +CL/ Unified OpenCL API headers tree +``` + +## License + +See [LICENSE](LICENSE). + +--- + +OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos. diff --git a/tools/android-debug-script/push2android.sh b/tools/android-debug-script/push2android.sh index 7183612bd5945635acabdfb7bfbfb0d5a9188cc6..68cbc6cf858ed9fbf7f1fd2522cd897309e31f78 100644 --- a/tools/android-debug-script/push2android.sh +++ b/tools/android-debug-script/push2android.sh @@ -10,7 +10,7 @@ adb shell mkdir ${EXE_DIR} MODELS_DIR="/data/local/tmp/models" adb shell mkdir ${MODELS_DIR} for file in `ls ${MODELS_SRC}` -do +do adb shell mkdir ${MODELS_DIR}"/"${file} done diff --git a/tools/build.sh b/tools/build.sh index c6554105718304c195bb4a3326c80947719033a0..3489ccd7397ee79ad16256519dba4e239a4c53a0 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -90,6 +90,8 @@ build_for_android() { fi cd "../build/release/${PLATFORM}" make -j 8 + mkdir ./build/cl_kernel + cp ../../../src/operators/kernel/cl/cl_kernel/* ./build/cl_kernel/ } diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook index ece9ebc598e3fa63d1d76409dc0068854aaec851..92377d2dd6b53c69aaff41e4ea204b80fef31671 100644 --- a/tools/pre-commit.hooks/clang-format.hook +++ b/tools/pre-commit.hooks/clang-format.hook @@ -17,7 +17,7 @@ shift perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@" ( # remove clang format ios_io folder -flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||') +flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||') clang-format -i $flist ) perl -i -pe 's|// ||' "$@"