diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index f95750406dc2c3cdd8cec7fb163c7acb446bef17..37143775b0c1bac1baf5fefcd591e62ba6246645 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -20,6 +20,7 @@ #include <vector> #include "src/runtime/kernel/arm/fp32/arithmetic.h" #include "src/runtime/opencl/opencl_runtime.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h index de0aac1a09bd1d22fd9456f10bdd67dfea1e3878..1f2c115f876b5e55e317e09ab01d8c212dd94390 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h @@ -19,7 +19,7 @@ #include <vector> #include "ir/anf.h" -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/arm/base/concat_base.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h index dac32964b53555b5ab9c4539bb08524f2d8dd639..c95ae57e5c0462065192d290045ea3847e85dbf1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h @@ -19,7 +19,7 @@ #include <vector> -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/arm/opclib/conv_parameter.h" #include "src/runtime/opencl/opencl_runtime.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index def22a8c4d7c3f9aef7033b9bdc43a7ba0b3ec58..0baefd3aacebb6737aa78c86ecc36434c6b3225c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -19,7 +19,7 @@ #include <vector> #include "src/ir/tensor.h" -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "schema/model_generated.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/arm/opclib/conv_parameter.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 02cbc61ae203263a59f70eb429fddc97b82eeaee..7761addbbeb5140b92fca19dc68f9f3bce77b440 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -17,10 +17,12 @@ #include "src/runtime/kernel/opencl/kernel/depthwise_conv2d.h" #include <string> #include <set> +#include <utility> #include "src/kernel_registry.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/arm/fp32/convolution_depthwise.h" #include "src/runtime/kernel/arm/opclib/pack.h" +#include "include/errorcode.h" #ifndef PROGRAM_WITH_IL @@ -29,9 +31,12 @@ #endif + +using mindspore::schema::PrimitiveType_DepthwiseConv2D; using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; -using mindspore::schema::PrimitiveType_DepthwiseConv2D; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; namespace mindspore::kernel { 
@@ -72,8 +77,8 @@ int DepthwiseConv2dOpenCLKernel::Init() { ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); #endif this->InitBuffer(); - MS_LOG(DEBUG) << kernel_name << " Init Done!"; - return 0; + MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(mem_type_); + return RET_OK; } int DepthwiseConv2dOpenCLKernel::InitBuffer() { @@ -109,10 +114,46 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { } else { MS_ASSERT(inputs_.size() == kInputSize1); } - return 0; + return RET_OK; } -int DepthwiseConv2dOpenCLKernel::ReSize() { return 0; } +int DepthwiseConv2dOpenCLKernel::ReSize() { + return RET_OK; +} + +int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* img_size) { + size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); + size_t im_dst_x, im_dst_y; + if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { + im_dst_x = outputs_[0]->Width() * CO4; + im_dst_y = outputs_[0]->Height(); + } else { + im_dst_y = outputs_[0]->Height() * CO4; + im_dst_x = outputs_[0]->Width(); + } +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + img_size->clear(); + std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; + *img_size = vec; + return RET_OK; +} +int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t>* global_size) { + size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); + std::vector <size_t> global = {(size_t) outputs_[0]->Width(), (size_t) outputs_[0]->Height(), CO4}; + *global_size = std::move(global); + return RET_OK; +} +int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t>& global_size, + std::vector<size_t>* local_size) { + size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); + std::vector <size_t> local = {1, 1, CO4}; + *local_size = std::move(local); + return RET_OK; +} int DepthwiseConv2dOpenCLKernel::Run() { MS_LOG(DEBUG) << this->Name() << " Running!"; @@ -120,8 +161,9 @@ int DepthwiseConv2dOpenCLKernel::Run() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); size_t CI4 = UP_DIV(inputs_[0]->Channel(), C4NUM); - std::vector<size_t> global = {(size_t)outputs_[0]->Width(), (size_t)outputs_[0]->Height(), CO4}; - std::vector<size_t> local = {1, 1, CO4}; + std::vector <size_t> global = {(size_t) outputs_[0]->Width(), (size_t) outputs_[0]->Height(), CO4}; + std::vector <size_t> local; + GetLocalSize(0, global, &local); float relu_clip1 = 6.0; cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_}; @@ -141,53 +183,10 @@ int DepthwiseConv2dOpenCLKernel::Run() { ocl_runtime->SetKernelArg(kernel_, 8, dilation); ocl_runtime->SetKernelArg(kernel_, 9, src_size); ocl_runtime->SetKernelArg(kernel_, 10, dst_size); - if (mem_type_ == MEM_TYPE::BUF) { - ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, 4, outputs_[0]->Data()); - ocl_runtime->RunKernel(kernel_, global, local, nullptr); - } else { - cl::ImageFormat image_format; - { - image_format.image_channel_order = CL_RGBA; - image_format.image_channel_data_type = CL_FLOAT; - } - cl_int in_error_code; - size_t im_src_x, im_src_y; - size_t im_dst_x, im_dst_y; - if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { - im_src_x = inputs_[0]->Width() * CI4; - im_src_y = inputs_[0]->Height(); - im_dst_x = outputs_[0]->Width() * CO4; - im_dst_y = outputs_[0]->Height(); - } else { - im_src_y = inputs_[0]->Height() * CI4; - im_src_x = inputs_[0]->Width(); - 
im_dst_y = outputs_[0]->Height() * CO4; - im_dst_x = outputs_[0]->Width(); - } - cl::Image2D in_mem(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, im_src_x, - im_src_y, 0, inputs_[0]->Data(), &in_error_code); - cl_int out_error_code; - cl::Image2D out_mem(*ocl_runtime->Context(), CL_MEM_WRITE_ONLY, image_format, im_dst_x, im_dst_y, 0, nullptr, - &out_error_code); - if (in_error_code != CL_SUCCESS) { - MS_LOG(DEBUG) << "in Image2D Failed, error=" << in_error_code; - return 1; - } - if (out_error_code != CL_SUCCESS) { - MS_LOG(DEBUG) << "out Image2D Failed, error= " << out_error_code; - return 1; - } - auto origin = cl::array<cl::size_type, 3U>{0, 0, 0}; - auto region = cl::array<cl::size_type, 3U>{im_dst_x, im_dst_y, 1}; - ocl_runtime->SetKernelArg(kernel_, 0, in_mem); - ocl_runtime->SetKernelArg(kernel_, 4, out_mem); - - ocl_runtime->RunKernel(kernel_, global, local, nullptr); - ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(out_mem, CL_TRUE, origin, region, 0, 0, - outputs_[0]->Data()); - } - return 0; + ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_, 4, outputs_[0]->Data()); + ocl_runtime->RunKernel(kernel_, global, local, nullptr); + return RET_OK; } kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h index 58e5a452418be243edbd13ca36c2c8bb721c751e..f4d0a1b8a2de34ae33f801a13d6a254ef596a32e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h @@ -18,17 +18,17 @@ #define MINDSPORE_LITE_SRC_BACKEND_OPENCL_DEPTHWISE_H_ #include <vector> -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/arm/opclib/conv_parameter.h" #include "src/runtime/opencl/opencl_runtime.h" namespace mindspore::kernel { -class DepthwiseConv2dOpenCLKernel : public LiteKernel { +class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { public: explicit DepthwiseConv2dOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, - const std::vector<lite::tensor::Tensor *> &outputs) - : LiteKernel(parameter, inputs, outputs), + const std::vector<lite::tensor::Tensor *> &outputs) + : OpenCLKernel(parameter, inputs, outputs), packed_weight_(nullptr), bias_data_(nullptr), kernel_(nullptr) {} ~DepthwiseConv2dOpenCLKernel() override {}; @@ -41,13 +41,18 @@ class DepthwiseConv2dOpenCLKernel : public LiteKernel { int InitBuffer(); + int GetImageSize(size_t idx, std::vector<size_t>* img_size) override; + int GetGlobalSize(size_t idx, std::vector<size_t>* global_size) override; + int GetLocalSize(size_t idx, const std::vector<size_t>& global_size, + std::vector<size_t>* local_size) override; + private: FLOAT_t *packed_weight_; FLOAT_t *bias_data_; cl::Kernel kernel_; enum class MEM_TYPE { BUF, IMG - } mem_type_{MEM_TYPE::BUF}; + } mem_type_{MEM_TYPE::IMG}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h index 90be6971039c4e407deae10d06e7a0f71c430751..fe2ef2b47ca187425607da52a41c1a77c57a4b9c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h @@ -19,7 +19,7 @@ #include 
<vector> -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/arm/opclib/conv_parameter.h" #include "src/runtime/opencl/opencl_runtime.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h index a39b43d187d9e79bfa22bc44986a5f1fd0439a48..53eec7e06f16b819f9ae7bb603b7b3a512b11b69 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h @@ -19,7 +19,7 @@ #include <vector> -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/arm/opclib/fp32/pooling.h" #include "src/runtime/opencl/opencl_runtime.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h index 393fb846bd2c0accf57f1b3f12552030a225ed9b..93fdf3e81866d4c2ad6a01f41eeda50d23d725b5 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h @@ -19,7 +19,7 @@ #include <vector> -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/arm/opclib/fp32/softmax.h" #include "src/runtime/opencl/opencl_runtime.h" diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..cf76286e35045f9cb7a305a671d4b75beafa3c7e --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -0,0 +1,42 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ +#define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ + +#include <vector> +#include "src/lite_kernel.h" + +namespace mindspore::kernel { +class OpenCLKernel : public LiteKernel { + public: + explicit OpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, + const std::vector<lite::tensor::Tensor *> &outputs) + : LiteKernel(parameter, inputs, outputs) {} + + virtual int Init() { return -1; } + virtual int Prepare() { return -1; } + virtual int InferShape() { return -1; } + virtual int ReSize() { return -1; } + virtual int Run() { return -1; } + virtual int GetImageSize(size_t idx, std::vector<size_t>* img_size) { return -1; } + virtual int GetGlobalSize(size_t idx, std::vector<size_t>* global_size) { return -1; } + virtual int GetLocalSize(size_t idx, const std::vector<size_t>& global_size, + std::vector<size_t>* local_size) { return -1; } +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc index 0d532f2991005933b8f678e97ae7bd5692def249..dcd2fe89435b4dacadc19b08cc9436355efc381a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc @@ -32,9 +32,10 @@ int SubGraphOpenCLKernel::Init() { } // Map buffer for write, it is not necessary for fine-grained for (auto &tensor : inputs_) { - void *data = allocator_->MapBuffer(tensor->Data(), CL_MAP_WRITE, nullptr, true); + void *data = tensor->Data(); // It is required with coarse-grained SVM if (data != nullptr) { + data = allocator_->MapBuffer(data, CL_MAP_WRITE, nullptr, true); tensor->SetData(data); } else { MS_LOG(ERROR) << "OpenCL kernel must use GPU buffer pointer, " diff --git a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h index 7786067cb21c1eb227f57655fcf31eb4fcba4709..7f7d5a343e0a78e0b4f2e261e6f10852555da635 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h @@ -18,7 +18,7 @@ #define MINDSPORE_LITE_SRC_BACKEND_OPENCL_SUBGRAPH_OPENCL_KENEL_H_ #include <vector> -#include "src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/opencl/opencl_allocator.h" namespace mindspore::kernel { diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 23a3b177a11e546a14fb4464d84783eaf2c371c9..d646f4b5be5474c1b5efaec8fb5d9338eb5eec5a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -21,6 +21,7 @@ #include <vector> #include "CL/cl2.hpp" #include "utils/log_adapter.h" +#include "src/runtime/kernel/arm/opclib/op_base.h" namespace mindspore::kernel { @@ -81,7 +82,6 @@ std::vector<size_t> GetLocalSize(const std::vector<size_t> &global, int max_size std::string CLErrorCode(cl_int error_code); - } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_UTILS_H_ diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index 0e2f595330b0138b44748f6c11069210f58ffc07..ed579540b48c1e8b11bab9c802cf67239c837872 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ 
b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -18,6 +18,7 @@ #include <utility> #include "utils/log_adapter.h" #include "src/runtime/opencl/opencl_runtime.h" +#include "include/errorcode.h" namespace mindspore::lite::opencl { @@ -61,7 +62,7 @@ void *OpenCLAllocator::Malloc(size_t size) { auto svm_capabilities = ocl_runtime->GetSVMCapabilities(); void *host_ptr = nullptr; void *device_ptr = nullptr; - if (svm_capabilities) { + if (svm_capabilities && svm_on_) { cl_svm_mem_flags flags = (svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0; flags |= (svm_capabilities & CL_DEVICE_SVM_ATOMICS) ? CL_MEM_SVM_ATOMICS : 0; flags = flags | CL_MEM_READ_WRITE; @@ -69,7 +70,7 @@ void *OpenCLAllocator::Malloc(size_t size) { } else { cl_int ret = CL_SUCCESS; cl::Buffer *buffer = - new cl::Buffer(*ocl_runtime->Context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret); + new cl::Buffer(*ocl_runtime->Context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret); if (ret != CL_SUCCESS) { MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")"; UnLock(); @@ -77,7 +78,13 @@ void *OpenCLAllocator::Malloc(size_t size) { } device_ptr = static_cast<void *>(buffer); host_ptr = ocl_runtime->MapBuffer(*buffer, CL_MAP_READ | CL_MAP_WRITE, size); - ocl_runtime->UnmapBuffer(*buffer, host_ptr); + if (host_ptr == nullptr) { + MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr; + UnLock(); + return nullptr; + } + cl::Memory *mem = buffer; + ocl_runtime->UnmapBuffer(*mem, host_ptr); } std::unique_ptr<MemBuf> mem_buf = std::make_unique<MemBuf>(); mem_buf->size_ = size; @@ -90,6 +97,113 @@ void *OpenCLAllocator::Malloc(size_t size) { return host_ptr; } +void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t>& img_size) { + if (size > MAX_MALLOC_SIZE) { + MS_LOG(ERROR) << "MallocData out of max_size, size: " << size; + return nullptr; + } + auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); + Lock(); + auto iter = free_list_.lower_bound(size); + if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { + auto mem_buf = iter->second; + bool is_match{mem_buf->img_size.size() == img_size.size()}; + for (int i = 0; i < img_size.size() && is_match; ++i) { + is_match = img_size[i] == mem_buf->img_size[i]; + } + if (is_match) { + free_list_.erase(iter); + allocated_list_[mem_buf->host_ptr_] = mem_buf; + UnLock(); + MS_LOG(DEBUG) << "Malloc Image2D from free list. size: " << mem_buf->size_ + << ", host addr: " << mem_buf->host_ptr_ << ", device addr: " << mem_buf->device_ptr_; + return mem_buf->host_ptr_; + } + } + void *host_ptr = nullptr; + void *device_ptr = nullptr; + cl_int ret = CL_SUCCESS; + // CL_HALF_FLOAT, CL_FLOAT + cl::ImageFormat image_format(CL_RGBA, img_size[2]); + cl::Image2D *buffer = new cl::Image2D(*ocl_runtime->Context(), CL_MEM_READ_WRITE, + image_format, img_size[0], img_size[1], 0, nullptr, &ret); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << "Create OpenCL Image2D failed! 
(ERROR CODE: " << ret << ")"; + UnLock(); + return nullptr; + } + device_ptr = static_cast<void *>(buffer); + std::vector<size_t> region{img_size[0], img_size[1], 1}; + host_ptr = ocl_runtime->MapBuffer(*buffer, 0, CL_MAP_READ | CL_MAP_WRITE, region); + if (host_ptr == nullptr) { + MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr; + UnLock(); + return nullptr; + } + cl::Memory *mem = buffer; + ocl_runtime->UnmapBuffer(*mem, host_ptr); + std::unique_ptr<MemBuf> mem_buf = std::make_unique<MemBuf>(); + mem_buf->size_ = size; + mem_buf->device_ptr_ = device_ptr; + mem_buf->host_ptr_ = host_ptr; + mem_buf->img_size = img_size; + MS_LOG(DEBUG) << "Malloc a new Image2D. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ + << ", device addr: " << mem_buf->device_ptr_; + allocated_list_[host_ptr] = mem_buf.release(); + UnLock(); + return host_ptr; +} + +void *OpenCLAllocator::CreateImageFromHost(void *data, size_t size, const std::vector<size_t>& img_size) { + if (size > MAX_MALLOC_SIZE) { + MS_LOG(ERROR) << "MallocData out of max_size, size: " << size; + return nullptr; + } + auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); + Lock(); + auto iter = free_list_.lower_bound(size); + if (iter != free_list_.end() && (iter->second->size_ >= size) && (iter->second->size_ < (size << shift_factor_))) { + auto mem_buf = iter->second; + free_list_.erase(iter); + allocated_list_[mem_buf->host_ptr_] = mem_buf; + UnLock(); + MS_LOG(DEBUG) << "Malloc Image2D from free list. size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ + << ", device addr: " << mem_buf->device_ptr_; + return mem_buf->host_ptr_; + } + void *host_ptr = nullptr; + void *device_ptr = nullptr; + cl_int ret = CL_SUCCESS; + // CL_HALF_FLOAT, CL_FLOAT + cl::ImageFormat image_format(CL_RGBA, img_size[2]); + cl::Image2D *buffer = new cl::Image2D(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, + img_size[0], img_size[1], 0, data, &ret); + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")"; + UnLock(); + return nullptr; + } + device_ptr = static_cast<void *>(buffer); + std::vector<size_t> region{img_size[0], img_size[1], 1}; + host_ptr = ocl_runtime->MapBuffer(*buffer, 0, CL_MAP_READ | CL_MAP_WRITE, region); + if (host_ptr == nullptr) { + MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << device_ptr << ", host_ptr=" << host_ptr; + UnLock(); + return nullptr; + } + cl::Memory *mem = buffer; + ocl_runtime->UnmapBuffer(*mem, host_ptr); + std::unique_ptr<MemBuf> mem_buf = std::make_unique<MemBuf>(); + mem_buf->size_ = size; + mem_buf->device_ptr_ = device_ptr; + mem_buf->host_ptr_ = host_ptr; + mem_buf->img_size = img_size; + MS_LOG(DEBUG) << "Malloc a new Image2D. 
size: " << mem_buf->size_ << ", host addr: " << mem_buf->host_ptr_ + << ", device addr: " << mem_buf->device_ptr_; + allocated_list_[host_ptr] = mem_buf.release(); + UnLock(); + return host_ptr; +} void OpenCLAllocator::Free(void *buf) { if (buf == nullptr) { return; @@ -163,7 +277,7 @@ void OpenCLAllocator::Clear() { void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, bool sync) { auto ocl_runtime = opencl::OpenCLRuntime::GetInstance(); auto svm_capabilities = ocl_runtime->GetSVMCapabilities(); - if (svm_capabilities) { + if (svm_capabilities && svm_on_) { if (!(svm_capabilities & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)) { auto it = allocated_list_.find(host_ptr); if (it == allocated_list_.end()) { @@ -178,11 +292,25 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue, auto it = allocated_list_.find(host_ptr); if (it == allocated_list_.end()) { MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << host_ptr; + UnLock(); return nullptr; } MemBuf *mem_buf = it->second; - cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_); - void *new_host_ptr = ocl_runtime->MapBuffer(*buffer, flags, mem_buf->size_, nullptr, sync); + void *new_host_ptr{nullptr}; + if (mem_buf->img_size.empty()) { + cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_); + new_host_ptr = ocl_runtime->MapBuffer(*buffer, flags, mem_buf->size_, nullptr, sync); + } else { + cl::ImageFormat image_format(CL_RGBA, mem_buf->img_size[2]); + std::vector<size_t> region{mem_buf->img_size[0], mem_buf->img_size[1], 1}; + cl::Image2D *buffer = static_cast<cl::Image2D *>(mem_buf->device_ptr_); + new_host_ptr = ocl_runtime->MapBuffer(*buffer, 0, CL_MAP_READ | CL_MAP_WRITE, region); + } + if (new_host_ptr == nullptr) { + MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << mem_buf->device_ptr_ << ", host_ptr=" << host_ptr; + UnLock(); + return nullptr; + } mem_buf->host_ptr_ = new_host_ptr; allocated_list_.erase(it); allocated_list_[new_host_ptr] = mem_buf; @@ -208,5 +336,40 @@ int OpenCLAllocator::UnmapBuffer(void *host_ptr, void *command_queue) { return ocl_runtime->UnmapBuffer(*buffer, it->second->host_ptr_, static_cast<cl::CommandQueue *>(command_queue)); } +MEM_TYPE OpenCLAllocator::GetMemType(void *host_ptr) { + MEM_TYPE mem_type{MEM_TYPE::BUF}; + Lock(); + auto it = allocated_list_.find(host_ptr); + if (it == allocated_list_.end()) { + MS_LOG(ERROR) << "Can not found buffer :" << host_ptr; + UnLock(); + return mem_type; + } + MemBuf *mem_buf = it->second; + if (mem_buf->img_size.empty()) { + mem_type = MEM_TYPE::BUF; + } else { + mem_type = MEM_TYPE::IMG; + } + UnLock(); + return mem_type; +} + +int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t>* img_size) { + Lock(); + auto it = allocated_list_.find(host_ptr); + if (it == allocated_list_.end()) { + MS_LOG(ERROR) << "Can not found buffer :" << host_ptr; + UnLock(); + return RET_OK; + } + MemBuf *mem_buf = it->second; + if (!mem_buf->img_size.empty()) { + *img_size = mem_buf->img_size; + } + UnLock(); + return RET_OK; +} + } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h index e8f6578347f483e34fdc19598fbe59eb44e3939f..0664020096e6de71c46e74d795d652113839c2cf 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h @@ -39,18 +39,27 @@ struct OpenclMemory { OpenCLMemoryType 
mem_type{MS_HOST_BUFFER | MS_CL_BUFFER}; }; +enum class MEM_TYPE : char { + BUF, IMG +}; + class OpenCLAllocator : public Allocator { public: OpenCLAllocator(); ~OpenCLAllocator() override; void SetContext(const AllocatorContext &ctx) override; void *Malloc(size_t size) override; + void *Malloc(size_t size, const std::vector<size_t>& img_size); + void *CreateImageFromHost(void *host_ptr, size_t size, const std::vector<size_t>& img_size); void Free(void *ptr) override; size_t GetTotalSize() override; + void Clear() override; void *GetDeviceBuffer(void *buffer); void *MapBuffer(void *host_ptr, int flags, void *command_queue = nullptr, bool sync = true); int UnmapBuffer(void *host_ptr, void *command_queue = nullptr); + MEM_TYPE GetMemType(void *host_ptr); + int GetImageSize(void *host_ptr, std::vector<size_t>* img_size); private: void Lock(); @@ -59,6 +68,7 @@ class OpenCLAllocator : public Allocator { size_t size_; void *device_ptr_; void *host_ptr_; + std::vector<size_t> img_size; }; std::mutex lock; @@ -68,6 +78,7 @@ class OpenCLAllocator : public Allocator { // 6 is empirical value int shift_factor_ = 6; bool lock_flag_ = false; + bool svm_on_{false}; }; } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index 216c9121fc086bac8c21852dcc9c1cb39f972a91..e57d1c30735f88df8a79f62d73c76806fa663bae 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -15,9 +15,10 @@ */ #include "src/runtime/opencl/opencl_executor.h" +#include "src/runtime/kernel/opencl/utils.h" #include "src/runtime/kernel/arm/opclib/pack.h" -#include "include/errorcode.h" #include "src/common/ms_tensor_utils.h" +#include "include/errorcode.h" namespace mindspore::lite::opencl { int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tensor::Tensor *> &outputs, @@ -29,23 +30,32 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso MS_LOG(ERROR) << "Graph input tensor is nullptr"; return RET_ERROR; } - if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4) { - if (inTensor->GetFormat() != schema::Format_NHWC) { - MS_LOG(ERROR) << "Model input should be NHWC, actual is " << schema::EnumNameFormat(inTensor->GetFormat()); - return RET_ERROR; - } else { - TransformTensorLayout(inTensor, schema::Format_NHWC4); - // TransformTensorLayout(inTensor, schema::Format_NC4HW4); - } + if (inTensor->GetFormat() != schema::Format_NHWC4 && inTensor->GetFormat() != schema::Format_NC4HW4 && + inTensor->GetFormat() != schema::Format_NHWC) { + MS_LOG(ERROR) << "input should be NHWC/NHWC4/NC4HW4, actual is " << schema::EnumNameFormat(inTensor->GetFormat()); + return RET_ERROR; + } else { + TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NHWC4, true); + // TransformTensorLayout(inTensor, inTensor->GetFormat(), schema::Format_NC4HW4, true); } } kernel::LiteKernelUtil::InitTensorRefCount(kernels); + OpenCLAllocator* op_allocator = reinterpret_cast<OpenCLAllocator*>(allocator); for (auto *kernel : kernels) { MS_ASSERT(nullptr != kernel); + kernel::OpenCLKernel *op_kernel = reinterpret_cast<kernel::OpenCLKernel*>(kernel); auto &outputs = kernel->GetOutputs(); - for (auto *output : outputs) { + for (auto i = 0; i < outputs.size(); ++i) { + auto *output = outputs.at(i); MS_ASSERT(nullptr != output); - output->MallocData(); + if (is_image2d_out_) { + 
std::vector<size_t> img_size; + op_kernel->GetImageSize(i, &img_size); + auto data_ptr = op_allocator->Malloc(output->Size(), img_size); + output->SetData(data_ptr); + } else { + output->MallocData(allocator); + } } session::CallBackParam callbackParam; callbackParam.name_callback_param = kernel->Name(); @@ -81,21 +91,22 @@ int OpenCLExecutor::Run(std::vector<tensor::Tensor *> &inputs, std::vector<tenso return RET_ERROR; } if (outTensor->GetFormat() != schema::Format_NHWC) { - MS_LOG(ERROR) << "Model output tensor should be NHWC"; + TransformTensorLayout(outTensor, outTensor->GetFormat(), schema::Format_NHWC, false); } } return RET_OK; } -int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format dst_format) { +int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, + schema::Format dst_format, bool trans_dir) { MS_ASSERT(nullptr != tensor); MS_ASSERT(4 == tensor->shape().size()); auto data_type = tensor->data_type(); switch (data_type) { case kNumberTypeInt8: - return TransformTensorLayoutUint8(tensor, dst_format); + return TransformTensorLayoutUint8(tensor, src_format, dst_format, trans_dir); case kNumberTypeFloat32: - return TransformTensorLayoutFp32(tensor, dst_format); + return TransformTensorLayoutFp32(tensor, src_format, dst_format, trans_dir); default: MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to " << schema::EnumNameFormat(dst_format); @@ -104,21 +115,103 @@ int OpenCLExecutor::TransformTensorLayout(tensor::Tensor *tensor, schema::Format return RET_OK; } -int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format dst_format) { +int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, + schema::Format dst_format, bool trans_dir) { MS_ASSERT(nullptr != tensor); MS_ASSERT(nullptr != allocator_); MS_ASSERT(4 == tensor->shape().size()); + if (trans_dir) { + if (is_image2d_out_) { + return TransformTensorLayoutToImage(tensor, src_format, dst_format); + } else { + return TransformTensorLayoutToBuffer(tensor, src_format, dst_format); + } + } else { + if (is_image2d_out_) { + return TransformTensorLayoutFromImage(tensor, src_format, dst_format); + } else { + return TransformTensorLayoutToBuffer(tensor, src_format, dst_format); + } + } +} + +int OpenCLExecutor::TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, + schema::Format dst_format) { if (dst_format == schema::Format_NHWC4) { auto *src_data = tensor->Data(); - auto *dst_data = allocator_->Malloc(tensor->Size()); - if (dst_data == nullptr) { - MS_LOG(ERROR) << "Malloc data failed"; - return RET_ERROR; + size_t C4 = UP_DIV(tensor->Channel(), C4NUM); + std::vector <size_t> img_size{tensor->Width() * C4, (size_t) tensor->Height(), CL_FLOAT}; + if (src_format == schema::Format_NHWC) { + auto *dst_data = allocator_->Malloc(tensor->Size(), img_size); + if (dst_data == nullptr) { + MS_LOG(ERROR) << "Malloc data failed"; + return RET_ERROR; + } + dst_data = reinterpret_cast<FLOAT_t *>(allocator_->MapBuffer(dst_data, CL_MAP_WRITE, nullptr, true)); + PackNHWCToNHWC4Fp32(src_data, dst_data, tensor->Batch(), tensor->Height() * tensor->Width(), tensor->Channel()); + tensor->SetData(dst_data); + allocator_->Free(src_data); + allocator_->UnmapBuffer(dst_data); } - dst_data = reinterpret_cast<FLOAT_t *>(allocator_->MapBuffer(dst_data, CL_MAP_WRITE, nullptr, true)); - PackNHWCToNHWC4Fp32(src_data, dst_data, tensor->Batch(), tensor->Height() * 
tensor->Width(), tensor->Channel()); - tensor->SetData(dst_data); tensor->SetFormat(dst_format); + return RET_OK; + } else if (dst_format == schema::Format_NHWC) { + // TODO(wandongdong): add support !! + return RET_OK; + } else { + MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to " + << schema::EnumNameFormat(dst_format) << " in float32"; + return RET_ERROR; + } +} + +int OpenCLExecutor::TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, + schema::Format dst_format) { + if (dst_format == schema::Format_NHWC4) { + // convert to nhwc4 + auto *src_data = tensor->Data(); + auto *dst_data{src_data}; + if (src_format == schema::Format_NHWC) { + dst_data = allocator_->Malloc(tensor->Size()); + if (dst_data == nullptr) { + MS_LOG(ERROR) << "Malloc data failed"; + return RET_ERROR; + } + dst_data = reinterpret_cast<FLOAT_t *>(allocator_->MapBuffer(dst_data, CL_MAP_WRITE, nullptr, true)); + PackNHWCToNHWC4Fp32(src_data, dst_data, tensor->Batch(), tensor->Height() * tensor->Width(), tensor->Channel()); + tensor->SetData(dst_data); + allocator_->Free(src_data); + allocator_->UnmapBuffer(dst_data); + } + // copy to image2d + src_data = dst_data; + size_t C4 = UP_DIV(tensor->Channel(), C4NUM); + std::vector<size_t> img_size{tensor->Width() * C4, (size_t)tensor->Height(), CL_FLOAT}; + dst_data = allocator_->CreateImageFromHost(src_data, tensor->Size(), img_size); + tensor->SetData(dst_data); + allocator_->Free(src_data); + tensor->SetFormat(schema::Format_NHWC4); + return RET_OK; + } else { + MS_LOG(ERROR) << "Unsupport layout transform: " << schema::EnumNameFormat(tensor->GetFormat()) << " to " + << schema::EnumNameFormat(dst_format) << " in float32"; + return RET_ERROR; + } +} + +int OpenCLExecutor::TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, + schema::Format dst_format) { + if (dst_format == schema::Format_NHWC) { + auto src_data = tensor->Data(); + auto dst_data = allocator_->Malloc(tensor->Size()); + cl::Image2D *out_mem = reinterpret_cast<cl::Image2D *>(allocator_->GetDeviceBuffer(src_data)); + std::vector<size_t> img_size; + allocator_->GetImageSize(src_data, &img_size); + auto origin = cl::array < cl::size_type, 3U > {0, 0, 0}; + auto region = cl::array < cl::size_type, 3U > {img_size[0], img_size[1], 1}; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(*out_mem, CL_TRUE, origin, region, 0, 0, dst_data); + tensor->SetData(dst_data); allocator_->Free(src_data); return RET_OK; } else { @@ -128,7 +221,8 @@ int OpenCLExecutor::TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Fo } } -int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format dst_format) { +int OpenCLExecutor::TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, + schema::Format dst_format, bool is_image) { MS_ASSERT(nullptr != tensor); MS_ASSERT(4 == tensor->shape().size()); // auto src_format = tensor->GetFormat(); diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.h b/mindspore/lite/src/runtime/opencl/opencl_executor.h index 6c0308ff7bab6a209ed0bc67bc3ebd9d285db5f3..d40a13574fe9606a02d81761703e52696e456362 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.h +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.h @@ -20,7 +20,7 @@ #include <vector> #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/allocator.h" -#include 
"src/lite_kernel.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/executor.h" #include "include/lite_session.h" @@ -38,15 +38,25 @@ class OpenCLExecutor : Executor { const session::KernelCallBack &before = nullptr, const session::KernelCallBack &after = nullptr); protected: - int TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format dst_format); + int TransformTensorLayoutFp32(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, + bool trans_dir = false); - int TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format dst_format); + int TransformTensorLayoutUint8(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, + bool trans_dir = false); - int TransformTensorLayout(tensor::Tensor *tensor, schema::Format dst_format); + int TransformTensorLayout(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format, + bool trans_dir = false); + + int TransformTensorLayoutToBuffer(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format); + + int TransformTensorLayoutToImage(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format); + + int TransformTensorLayoutFromImage(tensor::Tensor *tensor, schema::Format src_format, schema::Format dst_format); protected: Context *context = nullptr; OpenCLAllocator *allocator_; + bool is_image2d_out_{true}; }; } // namespace mindspore::lite::opencl diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index c4993a253349c9eb1fd99ee398d58dbc86232015..d503e2a32e6eb333c96cc127260486035461ed16 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -124,8 +124,13 @@ int OpenCLRuntime::Init() { const std::string device_name = device_->getInfo<CL_DEVICE_NAME>(); const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>(); const std::string opencl_version = device_->getInfo<CL_DEVICE_OPENCL_C_VERSION>(); + cl_uint align; + size_t ret; + clGetDeviceInfo((*device_)(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &align, &ret); MS_LOG(INFO) << "Device name:\t" << device_name; MS_LOG(INFO) << "Opencl version:\t" << device_version; + MS_LOG(INFO) << "Image alignment:\t" << align; + MS_LOG(INFO) << "Image ret:\t" << ret; MS_LOG(INFO) << "Highest OpenCL c version:\t" << opencl_version; MS_LOG(INFO) << "Max work item size:\t" << max_work_item_sizes_[0] << " : " @@ -133,7 +138,6 @@ int OpenCLRuntime::Init() { << max_work_item_sizes_[2]; gpu_info_ = ParseGpuInfo(device_name, device_version); - cl_int err; #if defined(SHARING_MEM_WITH_OPENGL) && (CL_HPP_TARGET_OPENCL_VERSION >= 120) // create context from glcontext @@ -164,6 +168,7 @@ int OpenCLRuntime::Init() { support_fp16_ = CL_SUCCESS == success && fp_config > 0; err = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_); + svm_capabilities_ = 0; if (err != CL_SUCCESS || svm_capabilities_ == 0) { svm_capabilities_ = 0; MS_LOG(INFO) << "SVM capalibilties: " @@ -535,7 +540,19 @@ int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::Command return command_queue->enqueueMapSVM(host_ptr, sync, flags, size); } -int OpenCLRuntime::UnmapBuffer(const cl::Buffer buffer, void *host_ptr, cl::CommandQueue *command_queue) const { +void *OpenCLRuntime::MapBuffer(const cl::Image2D buffer, bool sync, int flags, + const std::vector<size_t>& region, cl::CommandQueue *command_queue) const { + if (command_queue == 
nullptr) { + command_queue = default_command_queue_.get(); + } + cl::size_type row_pitch; + cl::size_type slice_pitch; + cl::array<cl::size_type, 3> origin_{0, 0, 0}; + cl::array<cl::size_type, 3> region_{region[0], region[1], region[2]}; + return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch); +} + +int OpenCLRuntime::UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue) const { if (command_queue == nullptr) { command_queue = default_command_queue_.get(); } diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index 64593f553bd6942a1f55060aa9e38696365a4b4a..173d0416d67ac47f46dac04b7cf4f71ed07c0310 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -75,9 +75,16 @@ class OpenCLRuntime { MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; return clSetKernelArgSVMPointer(kernel, index, value); } else { - cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetDeviceBuffer(value)); - MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << value; - return clSetKernelArg(kernel, index, sizeof((*buffer)()), &(*buffer)()); + MEM_TYPE mem_type = allocator_->GetMemType(value); + if (mem_type == MEM_TYPE::BUF) { + cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetDeviceBuffer(value)); + MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << value; + return clSetKernelArg(kernel, index, sizeof((*buffer)()), &(*buffer)()); + } else { + cl::Image2D *buffer = reinterpret_cast<cl::Image2D *>(allocator_->GetDeviceBuffer(value)); + MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Image2D " << value; + return clSetKernelArg(kernel, index, sizeof((*buffer)()), &(*buffer)()); + } } } @@ -107,9 +114,11 @@ class OpenCLRuntime { bool sync = false) const; void *MapBuffer(const cl::Buffer buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; + void *MapBuffer(const cl::Image2D buffer, bool sync, int flags, + const std::vector<size_t>& region, cl::CommandQueue *command_queue = nullptr) const; int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; - int UnmapBuffer(const cl::Buffer buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; + int UnmapBuffer(const cl::Memory buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc index e7f8aa5209ed23d678404e9ff04cf5da0fdb10f0..54f8f65292af0ee4d8803c0bee2acddaabad6e12 100755 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc @@ -35,6 +35,8 @@ a = nullptr; \ } +bool IMAGE2D_OPEN = true; + namespace mindspore { class TestConvolutionDwOpenCL : public mindspore::Common { public: @@ -95,6 +97,18 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t * std::vector<kernel::LiteKernel *> kernels{pKernel}; std::vector<lite::tensor::Tensor *> inputs_{tensor_a}; + size_t C4 = 
UP_DIV(inputs[0]->Channel(), C4NUM); + // if (IMAGE2D_OPEN && format == schema::Format_NHWC4) { + // std::vector<size_t> img_size{inputs[0]->Width() * C4, (size_t)inputs[0]->Height(), CL_FLOAT}; + // auto in_data = allocator->Malloc(inputs[0]->Size(), img_size); + // inputs[0]->SetData(in_data); + // } else if (IMAGE2D_OPEN && format == schema::Format_NC4HW4) { + // std::vector<size_t> img_size{(size_t)inputs[0]->Width(), inputs[0]->Height() * C4, CL_FLOAT}; + // auto in_data = allocator->Malloc(inputs[0]->Size(), img_size); + // inputs[0]->SetData(in_data); + // } else { + inputs[0]->MallocData(allocator); + // } auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs_, outputs, kernels, kernels, kernels); pGraph->Init(); @@ -103,9 +117,9 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t * pGraph->Run(); if (is_compare) { - float* packed_output = reinterpret_cast<float *>(outputs[0]->Data()); - float *packed_correct_data = new float[packed_output_size]; - memset(packed_correct_data, 0, packed_output_size * sizeof(float)); + float_t* packed_output = reinterpret_cast<float *>(outputs[0]->Data()); + float_t *packed_correct_data = new float_t[packed_output_size]; + memset(packed_correct_data, 0, packed_output_size * sizeof(float_t)); if (format == schema::Format_NC4HW4) { PackNHWCToNC4HW4Fp32(gnd_data, packed_correct_data, conv_param->output_batch_, conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_); @@ -128,7 +142,7 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t * std::cout << std::endl; printf("==================output data=================\n"); std::cout << std::endl; - for (int i = 0; i < packed_output_size; i++) { + for (int i = 0; i < 80/*packed_output_size*/; i++) { std::cout << packed_output[i] << ", "; } std::cout << std::endl; @@ -142,13 +156,13 @@ void DepthWiseTestMain(ConvParameter *conv_param, float_t *input_data, float_t * SAFE_DELETE_ARRAY(packed_correct_data) } + inputs[1]->SetData(nullptr); + inputs[2]->SetData(nullptr); SAFE_DELETE_ARRAY(packed_input); for (auto tensor : inputs) { - tensor->SetData(nullptr); SAFE_DELETE_PTR(tensor) } for (auto tensor : outputs) { - tensor->SetData(nullptr); SAFE_DELETE_PTR(tensor) } SAFE_DELETE_PTR(pKernel) @@ -477,6 +491,7 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) { std::vector<kernel::LiteKernel *> kernels{pKernel}; std::vector<lite::tensor::Tensor *> inputs_{tensor_a}; + inputs[0]->MallocData(); auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs_, outputs, kernels, kernels, kernels); pGraph->Init(); @@ -516,12 +531,12 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) { // compare Common::CompareOutputData(packed_output, packed_correct_data, packed_output_size, 0.00001); + inputs[1]->SetData(nullptr); + inputs[2]->SetData(nullptr); for (auto tensor : inputs) { - tensor->SetData(nullptr); SAFE_DELETE_PTR(tensor) } for (auto tensor : outputs) { - tensor->SetData(nullptr); SAFE_DELETE_PTR(tensor) } SAFE_DELETE_PTR(pKernel) @@ -640,6 +655,7 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { std::vector<kernel::LiteKernel *> kernels{pKernel}; std::vector<lite::tensor::Tensor *> inputs_{tensor_a}; + inputs[0]->MallocData(); auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs_, outputs, kernels, kernels, kernels); pGraph->Init(); @@ -687,14 +703,14 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) { // compare Common::CompareOutputData(packed_output, packed_correct_data, packed_output_size, 0.00001); + inputs[1]->SetData(nullptr); + 
inputs[2]->SetData(nullptr); SAFE_DELETE_ARRAY(packed_input); SAFE_DELETE_ARRAY(packed_correct_data) for (auto tensor : inputs) { - tensor->SetData(nullptr); SAFE_DELETE_PTR(tensor) } for (auto tensor : outputs) { - tensor->SetData(nullptr); SAFE_DELETE_PTR(tensor) } SAFE_DELETE_PTR(pKernel) @@ -742,35 +758,27 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) { }; // nhwc - float_t *input_data = new float_t[96*112*112]{ - 0.5488135 , 0.3834415 , 0.77815676, 0.9446689 , 0.6120957 , - 0.71518934, 0.79172504, 0.87001216, 0.5218483 , 0.616934 , - 0.60276335, 0.5288949 , 0.9786183 , 0.41466194, 0.94374806, - 0.5448832 , 0.56804454, 0.7991586 , 0.2645556 , 0.6818203 , - 0.4236548 , 0.92559665, 0.46147937, 0.7742337 , 0.3595079 , - 0.6458941 , 0.07103606, 0.7805292 , 0.45615032, 0.43703195, - 0.4375872 , 0.0871293 , 0.11827443, 0.56843394, 0.6976312 , - 0.891773 , 0.0202184 , 0.639921 , 0.0187898 , 0.06022547, - 0.96366274, 0.83261985, 0.14335328, 0.6176355 , 0.6667667 }; + size_t in_size = 96*112*112; + float_t *input_data = new float_t[in_size]; + memset(input_data, 0, in_size); + for (auto i = 0; i < in_size; ++i) { + input_data[i] = 1; + } // co h w ci - float_t *weight_data = new float_t[576*3*3]{ - 0.67063785, 0.21038257, 0.12892629, - 0.31542835, 0.36371076, 0.57019675, - 0.43860152, 0.9883738 , 0.10204481, - 0.20887676, 0.16130951, 0.6531083 , - 0.2532916 , 0.46631077, 0.2444256 , - 0.15896958, 0.11037514, 0.6563296 , - 0.13818295, 0.19658236, 0.36872518, - 0.82099324, 0.09710128, 0.8379449 , - 0.09609841, 0.97645944, 0.4686512 , - 0.9767611 , 0.6048455 , 0.7392636 , - 0.03918779, 0.28280696, 0.12019656, - 0.2961402 , 0.11872772, 0.31798318, - 0.41426298, 0.06414749, 0.6924721 , - 0.56660146, 0.2653895 , 0.5232481 , - 0.09394051, 0.5759465 , 0.9292962 }; + size_t wt_size = 576*3*3; + float_t *weight_data = new float_t[wt_size]; + memset(weight_data, 0, wt_size); + for (auto i = 0; i < wt_size; ++i) { + weight_data[i] = 1; + } + size_t out_size = 96*112*112; + float_t *gnd_data = new float_t[out_size]; + memset(gnd_data, 0, out_size); +// for (auto i = 0; i < in_size; ++i) { +// gnd_data[i] = 1; +// } for (size_t i = 0; i < src_shape.size(); ++i) { - const int MAX_RUN_TIMES = 10; + const int MAX_RUN_TIMES = 1; for (int j = 0; j < MAX_RUN_TIMES; ++j) { printf("========profiling depthwise, in shape(%d,%d,%d,%d), out shape(%d,%d,%d,%d), iter%d========\n", src_shape[i][0], src_shape[i][1], src_shape[i][2], src_shape[i][3], @@ -794,8 +802,8 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) { conv_param->dilation_h_ = 1; conv_param->dilation_w_ = 1; } - DepthWiseTestMain(conv_param, input_data, weight_data, nullptr, schema::Format_NC4HW4, false); - // DepthWiseTestMain(conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4, false); +// DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, false); + DepthWiseTestMain(conv_param, input_data, weight_data, nullptr, schema::Format_NHWC4, false); } } SAFE_DELETE_ARRAY(input_data); @@ -803,4 +811,54 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) { lite::opencl::OpenCLRuntime::DeleteInstance(); } +TEST_F(TestConvolutionDwOpenCL, Buffer2Image) { + std::vector<int> src_shape{1, 96, 64, 64}; + std::vector<int> dst_shape{1, 96, 32, 32}; + std::vector<int> filter_shape{96, 3, 3, 1}; + + // nhwc + size_t in_size = 96*112*112; + float_t *input_data = new float_t[in_size]; + memset(input_data, 0, in_size); + for (auto i = 0; i < in_size; ++i) { + input_data[i] = 1; + } + // co h w ci + size_t 
wt_size = 576*3*3; + float_t *weight_data = new float_t[wt_size]; + memset(weight_data, 0, wt_size); + for (auto i = 0; i < wt_size; ++i) { + weight_data[i] = 1; + } + size_t out_size = 96*112*112; + float_t *gnd_data = new float_t[out_size]; + memset(gnd_data, 0, out_size); +// for (auto i = 0; i < in_size; ++i) { +// gnd_data[i] = 1; +// } + ConvParameter *conv_param = new ConvParameter(); + { + conv_param->input_batch_ = 1; + conv_param->input_h_ = src_shape[2]; + conv_param->input_w_ = src_shape[3]; + conv_param->input_channel_ = src_shape[1]; + conv_param->output_batch_ = 1; + conv_param->output_h_ = dst_shape[2]; + conv_param->output_w_ = dst_shape[3]; + conv_param->output_channel_ = dst_shape[1]; + conv_param->kernel_h_ = filter_shape[1]; + conv_param->kernel_w_ = filter_shape[2]; + conv_param->stride_h_ = conv_param->output_h_/conv_param->input_h_; + conv_param->stride_w_ = conv_param->output_w_/conv_param->input_w_; + conv_param->pad_h_ = (conv_param->kernel_h_-1)/2; + conv_param->pad_w_ = (conv_param->kernel_w_-1)/2; + conv_param->dilation_h_ = 1; + conv_param->dilation_w_ = 1; + } +// DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NC4HW4, true); + DepthWiseTestMain(conv_param, input_data, weight_data, gnd_data, schema::Format_NHWC4, true); + SAFE_DELETE_ARRAY(input_data); + SAFE_DELETE_ARRAY(weight_data); + lite::opencl::OpenCLRuntime::DeleteInstance(); +} } // namespace mindspore
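
Note on the image layout introduced above: GetImageSize() folds groups of four channels into one RGBA pixel, so an NHWC4 tensor occupies Width * UP_DIV(C, 4) columns by Height rows (NC4HW4 swaps the two roles), with CL_FLOAT or CL_HALF_FLOAT channels depending on ENABLE_FP16. Below is a minimal standalone sketch of that arithmetic; UP_DIV and C4NUM are re-declared here to mirror the opclib macros, and TensorDesc is a hypothetical stand-in rather than the lite::tensor::Tensor API.

```cpp
// Sketch: how GetImageSize() derives Image2D extents for a channel-packed tensor.
// UP_DIV / C4NUM mirror the opclib macros; TensorDesc is a stand-in, not the real tensor class.
#include <cstddef>
#include <vector>
#include "CL/cl2.hpp"  // only needed for the CL_FLOAT / CL_HALF_FLOAT constants

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

struct TensorDesc {       // hypothetical stand-in
  size_t height, width, channel;
  bool is_nhwc4;          // true: NHWC4 layout, false: NC4HW4
};

// Returns {image_width, image_height, channel_data_type}, the same triple the
// depthwise kernel hands to OpenCLAllocator::Malloc(size, img_size).
std::vector<size_t> GetImageSize(const TensorDesc &t, bool fp16_enabled) {
  size_t c4 = UP_DIV(t.channel, C4NUM);              // RGBA pixels per (h, w) position
  size_t im_x = t.is_nhwc4 ? t.width * c4 : t.width;
  size_t im_y = t.is_nhwc4 ? t.height : t.height * c4;
  size_t dtype = fp16_enabled ? CL_HALF_FLOAT : CL_FLOAT;
  return {im_x, im_y, dtype};
}
```

For example, a 1x112x112x96 NHWC4 float tensor maps to a 2688 x 112 image of CL_FLOAT RGBA pixels (UP_DIV(96, 4) = 24, 112 * 24 = 2688), which is the {im_dst_x, im_dst_y, img_dtype} triple consumed by the new allocator path.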
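
The new OpenCLAllocator::Malloc(size, img_size) overload backs a tensor with a cl::Image2D instead of a cl::Buffer: it creates a CL_RGBA image with the requested channel type, maps it once so a host-visible pointer exists to key allocated_list_, and immediately unmaps it; later MapBuffer calls re-map the full region through enqueueMapImage. The following is a condensed sketch of that sequence against the stock cl2.hpp bindings; the context and queue creation are boilerplate added only to make the snippet self-contained (the real code goes through OpenCLRuntime).

```cpp
// Sketch of the Image2D allocate + map/unmap sequence used by
// OpenCLAllocator::Malloc(size, img_size). Setup code is generic cl2.hpp boilerplate.
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <iostream>
#include <vector>
#include "CL/cl2.hpp"

int main() {
  cl::Context context(CL_DEVICE_TYPE_GPU);
  cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>().front();
  cl::CommandQueue queue(context, device);

  // img_size = {width_in_pixels, height, channel_data_type}, as produced by GetImageSize().
  std::vector<size_t> img_size{2688, 112, CL_FLOAT};

  cl_int ret = CL_SUCCESS;
  cl::ImageFormat format(CL_RGBA, static_cast<cl_channel_type>(img_size[2]));
  cl::Image2D image(context, CL_MEM_READ_WRITE, format, img_size[0], img_size[1],
                    /*row_pitch=*/0, /*host_ptr=*/nullptr, &ret);
  if (ret != CL_SUCCESS) {
    std::cerr << "Create OpenCL Image2D failed! (ERROR CODE: " << ret << ")\n";
    return 1;
  }

  // Map once so a valid host pointer exists, then unmap; subsequent MapBuffer calls
  // re-map the same image with CL_MAP_READ | CL_MAP_WRITE over the full region.
  cl::array<cl::size_type, 3> origin{0, 0, 0};
  cl::array<cl::size_type, 3> region{img_size[0], img_size[1], 1};
  cl::size_type row_pitch = 0, slice_pitch = 0;
  void *host_ptr = queue.enqueueMapImage(image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                                         origin, region, &row_pitch, &slice_pitch,
                                         nullptr, nullptr, &ret);
  if (ret != CL_SUCCESS || host_ptr == nullptr) {
    std::cerr << "Map image failed (ERROR CODE: " << ret << ")\n";
    return 1;
  }
  queue.enqueueUnmapMemObject(image, host_ptr);
  queue.finish();
  return 0;
}
```

Keeping the mapped pointer as the tensor's Data() while the allocator records the cl::Image2D as device_ptr_ is what lets GetMemType()/GetDeviceBuffer() later decide whether SetKernelArg should pass the argument as a buffer or as an image handle.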
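
Read-back follows the same region convention: TransformTensorLayoutFromImage() looks up the image and its {width, height, dtype} triple through the allocator and issues a blocking enqueueReadImage over the full extent. A small sketch of that read-back, assuming the image and queue already exist (for instance from the allocation sketch above) and that the channel type is CL_FLOAT:

```cpp
// Sketch of the read-back path in TransformTensorLayoutFromImage(): copy a CL_RGBA
// image into plain host memory with a blocking enqueueReadImage. Assumes CL_FLOAT
// channels; a CL_HALF_FLOAT image would need a 16-bit element type instead.
#include <vector>
#include "CL/cl2.hpp"

std::vector<float> ReadImageToHost(cl::CommandQueue *queue, const cl::Image2D &image,
                                   const std::vector<size_t> &img_size) {
  // One RGBA pixel holds four float channels.
  std::vector<float> host(img_size[0] * img_size[1] * 4);
  auto origin = cl::array<cl::size_type, 3>{0, 0, 0};
  auto region = cl::array<cl::size_type, 3>{img_size[0], img_size[1], 1};
  // row_pitch = slice_pitch = 0 lets the runtime assume a tightly packed layout.
  queue->enqueueReadImage(image, CL_TRUE, origin, region, 0, 0, host.data());
  return host;
}
```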