diff --git a/CMakeLists.txt b/CMakeLists.txt index a80c50067e06d4317d89fcfcca96e14837e4d09d..020a9179d8a8c0127a4baf0ff51273a0337515a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,6 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) include_directories(src/) - if(IS_IOS) set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}") else() @@ -145,16 +144,16 @@ endif() if (ANDROID_NDK_TOOLCHAIN_INCLUDED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) endif () if (IS_IOS) else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h) endif () set(CMAKE_VERBOSE_MAKEFILE ON) diff --git a/src/framework/cl/cl_half.cpp b/src/framework/cl/cl_half.cpp index 40f94c9d4d267ebb1c0a320da716bbf731d52244..2877289325d983d0c7d9756732254e0a4ed831b6 100644 --- a/src/framework/cl/cl_half.cpp +++ b/src/framework/cl/cl_half.cpp @@ -16,6 +16,9 @@ limitations under the License. */ #include "framework/cl/cl_half.h" +namespace paddle_mobile { +namespace framework { + static const uint32_t mantissatable[2048] = { 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, @@ -510,3 +513,6 @@ void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { f_array[i] = Half2Float(h_array[i]); } } + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.h b/src/framework/cl/cl_half.h index fc864912b090adb1b673e4a2e1b35d832cada326..9b05740f1e19af66036a1562243102e5ba42ab1b 100644 --- a/src/framework/cl/cl_half.h +++ b/src/framework/cl/cl_half.h @@ -15,6 +15,9 @@ limitations under the License. 
*/ #pragma once #include +namespace paddle_mobile { +namespace framework { + typedef uint16_t half_t; half_t Float2Half(float f); @@ -24,3 +27,6 @@ float Half2Float(half_t h); void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h index 8ebbcc911a558e3acbc5f54914300ca8226f0b0a..8640f6b1a4f6f07df5d8fbb06c7f28c3b6ed127a 100644 --- a/src/framework/cl/cl_helper.h +++ b/src/framework/cl/cl_helper.h @@ -64,6 +64,16 @@ class CLHelper { auto work_size_2 = n * h; + return {work_size_0, work_size_1, work_size_2}; + } else if (image_dim.size() == 2) { + auto image_width = image.ImageWidth(); + + auto work_size_0 = image_width / image_dim[1]; + + auto work_size_1 = image_dim[1]; + + auto work_size_2 = image_dim[0]; + return {work_size_0, work_size_1, work_size_2}; } PADDLE_MOBILE_THROW_EXCEPTION("not support this dim, need imp"); diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index 447e08a7012018eae1d09b056a77a233e74f2bee..a999971192ceb01299b3b03846a95ec257de61d3 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "cl_image.h" +#include "framework/cl/cl_image.h" + namespace paddle_mobile { namespace framework { void CLImageToTensor(CLImage *cl_image, Tensor *tensor, @@ -37,7 +38,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor, size_t width = W * ((C + 3) / 4); size_t height = H * N; - float *p = tensor->data(); + float *p = tensor->mutable_data(); half imageData[width * height * 4]; cl_int err; cl_mem image = cl_image->GetCLImage(); @@ -63,7 +64,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor, } if (err != CL_SUCCESS) { - // TODO: error handling + CL_CHECK_ERRORS(err); } } void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, @@ -97,7 +98,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, imageData, 0, NULL, NULL); if (err != CL_SUCCESS) { - // TODO: error handling + CL_CHECK_ERRORS(err); } size_t i0 = 0; for (int n = 0; n < N; n++) { @@ -116,5 +117,64 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, i0 += width * H; } } +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &cl_image) { + printer << " dims: " << cl_image.dims() << "\n"; + int stride = cl_image.numel() / 20; + stride = stride > 0 ? 
stride : 1; + float *data = new float[cl_image.numel()]; + DDim ddim = cl_image.dims(); + size_t N, C, H, W; + if (ddim.size() == 4) { + N = ddim[0]; + if (N < 0) { + N = 1; + } + C = ddim[1]; + H = ddim[2]; + W = ddim[3]; + } else if (ddim.size() == 1) { + N = 1; + C = ddim[0]; + H = 1; + W = 1; + } + + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + float *p = data; + half imageData[width * height * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, + region, 0, 0, imageData, 0, NULL, NULL); + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(imageData[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } + + CL_CHECK_ERRORS(err); + + for (int i = 0; i < cl_image.numel(); i += stride) { + printer << data[i] << " "; + } + return printer; +} +#endif } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index 4c5ce128dd13fda4005135ae95048d028988c8bc..4ee64d77c1cb62df50ff8e0dce1bda22490cc567 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -46,27 +46,28 @@ class CLImage { /* * need call SetTensorData first * */ - void InitCLImage(cl_context context) { + void InitCLImage(cl_context context, cl_command_queue command_queue) { if (tensor_data_ == nullptr) { PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); } if (tensor_dims_.size() <= 2) { - InitCLImage2C(context, tensor_data_, tensor_dims_); + InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_); } else { - InitCLImage(context, tensor_data_, tensor_dims_); + InitCLImage(context, command_queue, tensor_data_, tensor_dims_); } delete[](tensor_data_); tensor_data_ = nullptr; initialized_ = true; } - void InitEmptyImage(cl_context context, const DDim &dim) { + void InitEmptyImage(cl_context context, cl_command_queue command_queue, + const DDim &dim) { if (tensor_data_ != nullptr) { PADDLE_MOBILE_THROW_EXCEPTION( " empty image tensor data shouldn't have value"); } DLOG << " init empty image "; - InitCLImage(context, nullptr, dim); + InitCLImage(context, command_queue, nullptr, dim); initialized_ = true; } @@ -93,6 +94,8 @@ class CLImage { * */ inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + inline cl_command_queue CommandQueue() const { return command_queue_; } + /* * resize original tensor dim * */ @@ -122,7 +125,9 @@ class CLImage { const DDim &dims() const { return tensor_dims_; } private: - void InitCLImage2C(cl_context context, float *tensor_data, const DDim &dim) { + void InitCLImage2C(cl_context context, cl_command_queue command_queue, + float *tensor_data, const DDim &dim) { + command_queue_ = command_queue; assert(dim.size() <= 2); int tdim[2] = {1, 1}; if (dim.size() == 1) { @@ -138,7 +143,8 @@ class CLImage { imageData.reset(new half_t[width * height * 4]); for (int h = 0; h < tdim[0]; h++) { for (int w = 0; w < tdim[1]; w++) { - imageData[(h * width + w / 4) * 4 + (w % 4)] = Float2Half(tensor_data[h * tdim[1] + w]); + imageData[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor_data[h * tdim[1] + w]); } } } @@ -149,35 +155,36 @@ class CLImage { cl_image_format cf = {.image_channel_order = CL_RGBA, .image_channel_data_type = 
CL_HALF_FLOAT}; cl_image_desc cid = { - .image_type = CL_MEM_OBJECT_IMAGE2D, - .image_width = width, - .image_height = height, - .image_depth = 1, - .image_array_size = 1, - .image_row_pitch = 0, - .image_slice_pitch = 0, - .num_mip_levels = 0, - .num_samples = 0, - // .buffer = nullptr + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = width, + .image_height = height, + .image_depth = 1, + .image_array_size = 1, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + // .buffer = nullptr }; cid.buffer = nullptr; cl_int err; cl_image_ = clCreateImage( - context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), - &cf, // const cl_image_format *image_format - &cid, // const cl_image_desc *image_desc - data, // void *host_ptr - &err - ); + context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), + &cf, // const cl_image_format *image_format + &cid, // const cl_image_desc *image_desc + data, // void *host_ptr + &err); if (err != CL_SUCCESS) { CL_CHECK_ERRORS(err); PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); } } - void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) { + void InitCLImage(cl_context context, cl_command_queue command_queue, + float *tensor_data, const DDim &dim) { DLOG << " tensor dim: " << dim; // NCHW -> [W * (C+3)/4, H * N] tensor_dims_ = dim; + command_queue_ = command_queue; if (tensor_data) { tensor_data_ = tensor_data; } @@ -203,6 +210,7 @@ class CLImage { image_width_ = width; image_height_ = height; image_dims_ = make_ddim({image_width_, image_height_}); + c_block_ = W / width; std::unique_ptr imageData{}; int count = 0; @@ -241,6 +249,7 @@ class CLImage { DDim image_dims_; float *tensor_data_; cl_context context_; + cl_command_queue command_queue_; }; void TensorToCLImage(Tensor *tensor, CLImage *image, diff --git a/src/framework/cl/cl_tensor.h b/src/framework/cl/cl_tensor.h index c38091dd39c776254035f9b13c8505d64686915a..1d6829fe4b77639f34df0be37d7a539b91ff4bcc 100644 --- a/src/framework/cl/cl_tensor.h +++ b/src/framework/cl/cl_tensor.h @@ -28,7 +28,19 @@ namespace framework { class CLTensor : TensorBase { public: - explicit CLTensor(cl_context context) : context_(context) {} + CLTensor(cl_context context, cl_command_queue command_queue) + : context_(context), command_queue_(command_queue) {} + + CLTensor() = default; + + /* + * if init method haven't set context and command_queue, need set + * */ + void SetContextAndCommandQueue(cl_context context, + cl_command_queue command_queue) { + context_ = context; + command_queue_ = command_queue; + } /*! Resize the dimensions of the memory block. 
*/ inline CLTensor &Resize(const DDim &dims) { @@ -39,7 +51,8 @@ class CLTensor : TensorBase { template inline T mutable_with_data(void *data) { int64_t size = numel() * sizeof(float); - holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_)); + holder_.reset( + new PlaceholderImpl(size, data, typeid(T), context_, command_queue_)); return reinterpret_cast( reinterpret_cast(reinterpret_cast(holder_->ptr()))); } @@ -51,7 +64,7 @@ class CLTensor : TensorBase { PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") int64_t size = numel() * SizeOfType(type); if (holder_ == nullptr || holder_->size() < size + offset_) { - holder_.reset(new PlaceholderImpl(size, type, context_)); + holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_)); offset_ = 0; } return reinterpret_cast( @@ -85,6 +98,7 @@ class CLTensor : TensorBase { private: cl_context context_; + cl_command_queue command_queue_; /* * virtual ~Placeholder() = default; @@ -99,20 +113,31 @@ class CLTensor : TensorBase { * */ struct PlaceholderImpl : public Placeholder { PlaceholderImpl(size_t size, void *input, std::type_index type, - cl_context context) + cl_context context, cl_command_queue command_queue) : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size, reinterpret_cast(input), NULL)), size_(size), - type_(type) {} + type_(type), + command_queue_(command_queue) {} - PlaceholderImpl(size_t size, std::type_index type, cl_context context) + PlaceholderImpl(size_t size, std::type_index type, cl_context context, + cl_command_queue command_queue) : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)), size_(size), - type_(type) {} + type_(type), + command_queue_(command_queue) {} virtual size_t size() const { return size_; } - virtual void *ptr() const { return static_cast(ptr_.get()); } + virtual void *ptr() const { + if (host_ptr_) { + delete (host_ptr_); + } + char *host_ptr = new char[size_]; + clEnqueueReadBuffer(command_queue_, ptr_.get(), CL_TRUE, 0, size_, + host_ptr, 0, NULL, NULL); + return static_cast(host_ptr); + } virtual std::type_index type() const { return type_; } @@ -124,6 +149,17 @@ class CLTensor : TensorBase { /* the current type of memory */ std::type_index type_; + + cl_command_queue command_queue_; + + ~PlaceholderImpl() { + if (host_ptr_) { + delete (host_ptr_); + } + } + + private: + void *host_ptr_; }; }; diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index d61abac29e7946c75373f3168ede2df8ecc3a33d..80589706f94eb0c2331d5af0049c6d53df8ca876 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -87,7 +87,7 @@ Executor::Executor(const framework::Program p, int batch_size, for (int i = 0; i < blocks.size(); ++i) { std::shared_ptr block_desc = blocks[i]; std::vector> ops = block_desc->Ops(); - for (int j = 0; j < debug_to; ++j) { + for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op = ops[j]; DLOG << "create op: " << j << " " << op->Type(); auto op_base = framework::OpRegistry::CreateOp( @@ -416,7 +416,7 @@ std::shared_ptr Executor::Predict( } } #else - for (int i = 0; i < debug_to; i++) { + for (int i = 0; i < ops.size(); i++) { #ifdef PADDLE_MOBILE_PROFILE struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); @@ -953,12 +953,14 @@ void Executor::InitMemory() { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto cl_image = var->template GetMutable(); cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + 
program_.scope->GetCLScpoe()->CommandQueue(); const framework::TensorDesc &desc = var_desc->Tensor_desc(); // framework::DDim ddim = framework::make_ddim(desc.Dims()); framework::DDim ddim = cl_image->dims(); DLOG << var_desc->Name(); - cl_image->InitEmptyImage(context, ddim); + cl_image->InitEmptyImage(context, command_queue, ddim); } } } @@ -1010,11 +1012,12 @@ void Executor::InitCombineMemory() { } else { auto cl_image = var->template GetMutable(); cl_context context = program_.scope->GetCLScpoe()->Context(); - + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); const framework::TensorDesc &desc = var_desc->Tensor_desc(); framework::DDim ddim = cl_image->dims(); // framework::DDim ddim = framework::make_ddim(desc.Dims()); - cl_image->InitEmptyImage(context, ddim); + cl_image->InitEmptyImage(context, command_queue, ddim); } } } diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 1bfac97c8bed8fea68b7d24b495ef6f4a2009340..ab9d4f788aa1eb8db8cc38b797c9f097ed260dac 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -57,10 +57,9 @@ void OperatorBase::CheckAllInputOutputSet() const {} template void OperatorBase::Run() { - DLOG << " begin run " << type_; + DLOG << " ----- Begin run impl --- " << type_ << " ----- "; RunImpl(); - DLOG << " end run " << type_; - return; + DLOG << " ----- End run impl --- " << type_ << " ----- "; #ifdef PADDLE_MOBILE_DEBUG DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); @@ -75,16 +74,8 @@ void OperatorBase::Run() { if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; } else { CLImage *cl_image = vari->template GetMutable(); - // cl_command_queue commandQueue = - // scope_->GetCLScpoe()->CommandQueue(); Tensor - // *tmp ; - // CLImageToTensor(cl_image,tmp,commandQueue); - // tmp->Resize(cl_image->dims()); - const float *input = cl_image->data(); if (cl_image) { - DLOG << type_ << " input- " << key << "=" << cl_image->dims(); - // if(input) - // DLOG<= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= 
input_height)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + for (int j = 0; j < 9; ++j) { + int2 fuck; + fuck.x = i * 3 + j % 3; + fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, fuck); + output.x += dot(input[j], weight_x); + + fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, fuck); + output.y += dot(input[j], weight_y); + + fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, fuck); + output.z += dot(input[j], weight_z); + + fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, fuck); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + + + + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + const uint kernelHXW = 1; + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + int weight_x_to = out_c * 3; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + 
in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); + output.x += input.x * weight.x; + output.y += input.y * weight.y; + output.z += input.z * weight.z; + output.w += input.w * weight.w; + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + +__kernel void conv_1x1(__private const 
int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); + output.x += dot(input, weight_x); + + half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)); + output.y += dot(input, weight_y); + + half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)); + output.z += dot(input, weight_z); + + half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); + output.w += dot(input, weight_w); + + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl index 3ec50f82d237d0ff62773229728f7ad867668b02..34a687dbb7d9d6424f57f85e94591f3b46e38a1d 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -12,6 +12,324 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + #define BIASE -#include "conv_kernel.inc.cl" -#undef + +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + #ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input[9]; + + + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + 
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + for (int j = 0; j < 9; ++j) { + int2 fuck; + fuck.x = i * 3 + j % 3; + fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, fuck); + output.x += dot(input[j], weight_x); + + fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, fuck); + output.y += dot(input[j], weight_y); + + fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, fuck); + output.z += dot(input[j], weight_z); + + fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, fuck); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + + + + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + const uint kernelHXW = 1; + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + int weight_x_to = out_c * 3; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 
>= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); + output.x += input.x * weight.x; + output.y += input.y * weight.y; + output.z += input.z * weight.z; + output.w += input.w * weight.w; + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + +__kernel void conv_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only 
image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); + output.x += dot(input, weight_x); + + half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)); + output.y += dot(input, weight_y); + + half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)); + output.z += dot(input, weight_z); + + half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); + output.w += dot(input, weight_w); + + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index b45e9738f88965da8d7c026a67e73ddc92d73895..2b9936e94fba94c14ebe99fdae1fa6e963b87195 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -19,6 +19,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, + #ifdef BIASE __read_only image2d_t bias, #endif diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index f24b2d6f932955533c9e71148d64ad69f33d9d9f..272e130817eda62f71a67e179a57ce63f024bc4d 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -29,8 +29,10 @@ bool ConvAddBNReluKernel::Init( param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); - param->Bias()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); // const CL *mean = param->InputMean(); const framework::CLImage *mean = param->InputMean(); @@ -38,6 +40,11 @@ bool 
ConvAddBNReluKernel::Init( const framework::CLImage *scale = param->InputScale(); const framework::CLImage *bias = param->InputBias(); const float epsilon = param->Epsilon(); + // + // DLOG << " climage mean: " << *mean; + // DLOG << " climage variance: " << *variance; + // DLOG << " climage scale: " << *scale; + // DLOG << " climage bias: " << *bias; auto mean_ptr = mean->data(); auto variance_ptr = variance->data(); @@ -62,12 +69,22 @@ bool ConvAddBNReluKernel::Init( framework::CLImage *new_scale = new framework::CLImage(); new_scale->SetTensorData(new_scale_ptr, variance->dims()); - new_scale->InitCLImage(this->cl_helper_.CLContext()); + new_scale->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + DLOG << " climage - y bias: " << *(param->Bias()); + + DLOG << " climage - new scale: " << *new_scale; framework::CLImage *new_bias = new framework::CLImage(); new_bias->SetTensorData(new_bias_ptr, variance->dims()); - new_bias->InitCLImage(this->cl_helper_.CLContext()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + DLOG << " climage - new bias: " << *new_bias; + + DLOG << " climage - filter: " << *(param->Filter()); param->SetNewScale(new_scale); param->SetNewBias(new_bias); @@ -113,7 +130,7 @@ void ConvAddBNReluKernel::Compute( auto biase = param.Bias()->GetCLImage(); auto new_scale = param.NewScale()->GetCLImage(); auto new_bias = param.NewBias()->GetCLImage(); - auto output = param.Output(); + auto output = param.Output()->GetCLImage(); int stride = param.Strides()[0]; int offset = param.Offset(); int input_c = param.Input()->CBlock(); @@ -126,23 +143,54 @@ void ConvAddBNReluKernel::Compute( cl_int status; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); CL_CHECK_ERRORS(status); status = diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index ac36e70ed793ee000602114eb690780235b7d858..c6306aeba7fe1a00a6ee16c8d141aadf8d102d9c 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp 
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -25,8 +25,10 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); - param->Bias()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); @@ -71,27 +73,53 @@ void ConvAddKernel::Compute( cl_int status; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); - status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); CL_CHECK_ERRORS(status); status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index bc50440273c48df6ea765716b2b7711d0765b007..27ebe18bafab685a1cb789dec7256f29d1c9bc2e 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -26,7 +26,8 @@ bool ConvKernel::Init(ConvParam *param) { param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); @@ -95,6 +96,17 @@ void ConvKernel::Compute(const ConvParam ¶m) { cl_int status; DLOG << " begin set kernel arg "; + DLOG << " c block " << c_block; + DLOG << " w " << w; + DLOG << " nh " << nh; + DLOG << " stride " << stride; + DLOG << " offset " << offset; + DLOG << " input_c " << input_c; + DLOG << " dilation " << dilation; + DLOG << " input width " << input_width; + DLOG << " input height " << input_height; + DLOG << " output width " << output_width; + DLOG << " output height " << output_height; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); CL_CHECK_ERRORS(status); diff --git 
a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp index dcb95b4c6a9c42ff8c08f83611fc89aebc6efdfa..ad0c9958f817f96139fb93f2a33df0cdec833d35 100644 --- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -27,7 +27,8 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Paddings()[0] == param->Paddings()[1], "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext()); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); int offset = static_cast(param->Filter()->dims()[2]) / 2 - static_cast(param->Paddings()[1]); param->SetOffset(offset); diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp index 6cc7b819b104ad3819065df3fe0d42fa923189bf..5b31a9c61df3cc0615659fb1544f32299d854aa9 100644 --- a/src/operators/kernel/cl/elementwise_add_kernel.cpp +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -22,16 +22,16 @@ namespace operators { template <> bool ElementwiseAddKernel::Init( ElementwiseAddParam *param) { - CLImage *bias = (CLImage*)param->InputY(); - bias->InitCLImage(cl_helper_.CLContext()); - if(bias->dims().size()==4){ - this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); - }else if(param->InputY()->dims().size()==1){ - DLOG<<"-----init add-----"; - this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); - }else{ - DLOG << "error:bias dims is error"; - } + CLImage *bias = (CLImage *)param->InputY(); + bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); + if (bias->dims().size() == 4) { + this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + } else if (param->InputY()->dims().size() == 1) { + DLOG << "-----init add-----"; + this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); + } else { + DLOG << "error:bias dims is error"; + } return true; } @@ -44,7 +44,7 @@ void ElementwiseAddKernel::Compute( auto output = param.Out(); cl_int status; auto kernel = this->cl_helper_.KernelAt(0); - if(bias->dims().size()==4){ + if (bias->dims().size() == 4) { cl_mem input_image = input->GetCLImage(); cl_mem bias_image = bias->GetCLImage(); cl_mem output_image = output->GetCLImage(); @@ -57,14 +57,15 @@ void ElementwiseAddKernel::Compute( int width = input->ImageWidth(); int height = input->ImageHeight(); size_t global_work_size[2] = {width, height}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - }else if(bias->dims().size()==1){ + } else if (bias->dims().size() == 1) { cl_mem input_image = input->GetCLImage(); cl_mem bias_image = bias->GetCLImage(); cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[4]; + int tensor_w = input->dims()[3]; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_image); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bias_image); @@ -76,13 +77,13 @@ void ElementwiseAddKernel::Compute( int width = input->ImageWidth(); int height = input->ImageHeight(); size_t global_work_size[2] = {width, height}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - 
NULL, global_work_size, NULL, 0, NULL, NULL); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - }else{ + } else { DLOG << "error:bias dims is error"; } - } template class ElementwiseAddKernel; diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 6b4d883733634401f293ff304d63bfb3d913e134..0db2b7cc4665ff74d06ca62ba9e77d427d883233 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -30,12 +30,14 @@ void FeedKernel::Compute(const FeedParam ¶m) { cl_int status; auto output = param.Out(); const Tensor *input = param.InputX(); + DLOG << *input; const float *input_data = input->data(); int numel = input->numel(); cl_mem cl_image = output->GetCLImage(); int height = output->dims()[2]; int width = output->dims()[3]; - CLTensor input_cl_tensor(this->cl_helper_.CLContext()); + CLTensor input_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); input_cl_tensor.Resize(input->dims()); cl_mem inputBuffer = input_cl_tensor.mutable_with_data((void *)input_data); @@ -53,14 +55,6 @@ void FeedKernel::Compute(const FeedParam ¶m) { status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - - int len = 4 * 224 * 224; - half *out = new half[len]; - cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue(); - size_t origin[3] = {0, 0, 0}; - size_t region[3] = {height, width, 1}; - clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out, - 0, NULL, NULL); } template class FeedKernel; diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index 995713ce5afaf0a93bc6b8ddd9928d7cee1c55ff..ceaf2f365a48dc0c41fd3da74d803bacb83b6cf6 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -19,44 +19,45 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + // this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); return true; } template <> void FetchKernel::Compute(const FetchParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); - - auto input = param.InputX()->GetCLImage(); - auto *out = param.Out(); - - const auto &dims = param.InputX()->dims(); - const int N = dims[0]; - const int C = dims[1]; - const int in_height = dims[2]; - const int in_width = dims[3]; - - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * C; - - // need create outputBuffer - cl_image_format imageFormat; - imageFormat.image_channel_order = CL_RGBA; - imageFormat.image_channel_data_type = CL_FLOAT; - cl_mem outputBuffer; - - clSetKernelArg(kernel, 0, sizeof(int), &in_height); - clSetKernelArg(kernel, 1, sizeof(int), &in_width); - clSetKernelArg(kernel, 2, sizeof(int), &size_ch); - clSetKernelArg(kernel, 3, sizeof(int), &size_block); - clSetKernelArg(kernel, 4, sizeof(int), &size_batch); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &input); - clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + // auto kernel = this->cl_helper_.KernelAt(0); + // auto default_work_size = + // 
this->cl_helper_.DefaultWorkSize(*param.InputX()); + // + // auto input = param.InputX()->GetCLImage(); + // auto *out = param.Out(); + // + // const auto &dims = param.InputX()->dims(); + // const int N = dims[0]; + // const int C = dims[1]; + // const int in_height = dims[2]; + // const int in_width = dims[3]; + // + // int size_ch = in_height * in_width; + // int size_block = size_ch * 4; + // int size_batch = size_ch * C; + // + // // need create outputBuffer + // cl_image_format imageFormat; + // imageFormat.image_channel_order = CL_RGBA; + // imageFormat.image_channel_data_type = CL_FLOAT; + // cl_mem outputBuffer; + // + // clSetKernelArg(kernel, 0, sizeof(int), &in_height); + // clSetKernelArg(kernel, 1, sizeof(int), &in_width); + // clSetKernelArg(kernel, 2, sizeof(int), &size_ch); + // clSetKernelArg(kernel, 3, sizeof(int), &size_block); + // clSetKernelArg(kernel, 4, sizeof(int), &size_batch); + // clSetKernelArg(kernel, 5, sizeof(cl_mem), &input); + // clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer); + // + // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + // default_work_size.data(), NULL, 0, NULL, NULL); } template class FetchKernel; diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp index 210932337ce676bbeff3e04f1c2e46b3051bc123..b0d1537da248b72c00f289d9863e19270dae631b 100644 --- a/src/operators/kernel/cl/reshape_kernel.cpp +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -37,19 +37,19 @@ void ReshapeKernel::Compute(const ReshapeParam ¶m) { int dims[4] = {1, 1, 1, 1}; int odims[4] = {1, 1, 1, 1}; for (int i = 0; i < inputDim.size(); i++) { - dims[4-inputDim.size()+i] = inputDim[i]; + dims[4 - inputDim.size() + i] = inputDim[i]; } for (int i = 0; i < outputDim.size(); i++) { - odims[4-outputDim.size()+i] = outputDim[i]; + odims[4 - outputDim.size() + i] = outputDim[i]; } - clSetKernelArg(kernel, 2, sizeof(int), dims); - clSetKernelArg(kernel, 3, sizeof(int), dims + 1); - clSetKernelArg(kernel, 4, sizeof(int), dims + 2); - clSetKernelArg(kernel, 5, sizeof(int), dims + 3); - clSetKernelArg(kernel, 6, sizeof(int), odims); - clSetKernelArg(kernel, 7, sizeof(int), odims + 1); - clSetKernelArg(kernel, 8, sizeof(int), odims + 2); - clSetKernelArg(kernel, 9, sizeof(int), odims + 3); + clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); + clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); + clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); + clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); + clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); + clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index 1404ea40c703c8da2db09551fc6da440771f7366..a8196cf37607631e0cabb231c4e63a8ca338c28a 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -36,11 +36,14 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); const auto &inputDim = input->dims(); - int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]}; - clSetKernelArg(kernel, 2, sizeof(int), dims); - 
clSetKernelArg(kernel, 3, sizeof(int), dims + 1); - clSetKernelArg(kernel, 4, sizeof(int), dims + 2); - clSetKernelArg(kernel, 5, sizeof(int), dims + 3); + int dims[4] = {1, 1, 1, 1}; + for (int i = 0; i < inputDim.size(); i++) { + dims[4 - inputDim.size() + i] = inputDim[i]; + } + clSetKernelArg(kernel, 2, sizeof(int), &dims); + clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); + clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); + clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp index f69334daf2f24bdd4b41ee58e7236051d1459809..a5a78f7f8f6fe93ca7412f0ec007c291b26417af 100644 --- a/test/net/test_mobilenet_GPU.cpp +++ b/test/net/test_mobilenet_GPU.cpp @@ -23,7 +23,7 @@ int main() { // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // std::string(g_mobilenet_detect) + "/params", true); - auto isok = paddle_mobile.Load(g_mobilenet, false); + auto isok = paddle_mobile.Load(g_mobilenet, true); if (isok) { auto time2 = paddle_mobile::time(); std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook index ece9ebc598e3fa63d1d76409dc0068854aaec851..92377d2dd6b53c69aaff41e4ea204b80fef31671 100644 --- a/tools/pre-commit.hooks/clang-format.hook +++ b/tools/pre-commit.hooks/clang-format.hook @@ -17,7 +17,7 @@ shift perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@" ( # remove clang format ios_io folder -flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||') +flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||') clang-format -i $flist ) perl -i -pe 's|// ||' "$@"