From 88120708f7bf6da6d38803f910c27ec559fcdfaa Mon Sep 17 00:00:00 2001 From: liuqi Date: Sat, 28 Jul 2018 18:15:18 +0800 Subject: [PATCH] Add GPU availability check and return status to user if a GPU call fails. --- mace/core/file_storage.cc | 4 +- mace/core/runtime/opencl/opencl_allocator.cc | 15 +- mace/core/runtime/opencl/opencl_runtime.cc | 133 ++++- mace/core/runtime/opencl/opencl_runtime.h | 27 +- mace/core/runtime/opencl/opencl_wrapper.cc | 480 ++++++++++++------ mace/core/workspace.cc | 42 +- mace/examples/cli/example.cc | 5 +- mace/kernels/opencl/activation.cc | 6 +- mace/kernels/opencl/addn.cc | 6 +- mace/kernels/opencl/batch_norm.cc | 6 +- mace/kernels/opencl/bias_add.cc | 5 +- mace/kernels/opencl/buffer_to_image.cc | 8 +- mace/kernels/opencl/channel_shuffle.cc | 8 +- mace/kernels/opencl/concat.cc | 29 +- mace/kernels/opencl/conv_2d_1x1.cc | 54 +- mace/kernels/opencl/conv_2d_3x3.cc | 47 +- mace/kernels/opencl/conv_2d_general.cc | 55 +- mace/kernels/opencl/crop.cc | 24 +- mace/kernels/opencl/deconv_2d_opencl.cc | 6 +- mace/kernels/opencl/depth_to_space.cc | 9 +- mace/kernels/opencl/depthwise_conv.cc | 50 +- mace/kernels/opencl/eltwise.cc | 6 +- mace/kernels/opencl/fully_connected.cc | 14 +- mace/kernels/opencl/helper.cc | 65 ++- mace/kernels/opencl/helper.h | 22 +- mace/kernels/opencl/image_to_buffer.cc | 10 +- mace/kernels/opencl/matmul.cc | 6 +- .../kernels/opencl/out_of_range_check_test.cc | 15 +- mace/kernels/opencl/pad.cc | 6 +- mace/kernels/opencl/pooling.cc | 35 +- mace/kernels/opencl/reduce_mean_opencl.cc | 12 +- mace/kernels/opencl/resize_bilinear.cc | 47 +- mace/kernels/opencl/slice.cc | 7 +- mace/kernels/opencl/softmax.cc | 30 +- mace/kernels/opencl/space_to_batch.cc | 9 +- mace/kernels/opencl/winograd_transform.cc | 18 +- mace/libmace/mace.cc | 44 ++ mace/proto/mace.proto | 11 +- mace/public/mace_runtime.h | 127 +++-- mace/python/tools/convert_util.py | 65 +++ mace/python/tools/converter.py | 7 + .../tools/converter_tool/base_converter.py | 1 + .../tools/converter_tool/transformer.py | 32 +- mace/python/tools/memory_optimizer.py | 108 ++-- mace/python/tools/model.jinja2 | 1 + mace/test/mace_api_mt_test.cc | 2 + mace/test/mace_api_test.cc | 2 + 47 files changed, 1161 insertions(+), 560 deletions(-) diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc index 99af2c19..99731a81 100644 --- a/mace/core/file_storage.cc +++ b/mace/core/file_storage.cc @@ -37,8 +37,8 @@ int FileStorage::Load() { struct stat st; if (stat(file_path_.c_str(), &st) == -1) { if (errno == ENOENT) { - LOG(INFO) << "File " << file_path_ - << " does not exist"; + VLOG(1) << "File " << file_path_ + << " does not exist"; return 0; } else { LOG(WARNING) << "Stat file " << file_path_ diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index ad5c8eac..7dda80e6 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { void *mapped_ptr = queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset, nbytes, nullptr, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error); + if (error != CL_SUCCESS) { + LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error); + mapped_ptr = nullptr; + } return mapped_ptr; } @@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer, *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region, mapped_image_pitch->data(),
mapped_image_pitch->data() + 1, nullptr, nullptr, &error); - MACE_CHECK_CL_SUCCESS(error); - + if (error != CL_SUCCESS) { + LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error); + mapped_ptr = nullptr; + } return mapped_ptr; } @@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { auto queue = OpenCLRuntime::Global()->command_queue(); cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr, nullptr); - MACE_CHECK_CL_SUCCESS(error); + if (error != CL_SUCCESS) { + LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error); + } } bool OpenCLAllocator::OnHost() const { return false; } diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index c5809ffd..f901973f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath( OpenCLRuntime::OpenCLRuntime(): precompiled_binary_storage_(nullptr), cache_storage_(nullptr), - is_profiling_enabled_(false) { + is_opencl_avaliable_(false), + is_profiling_enabled_(false), + opencl_version_(CL_VER_UNKNOWN), + gpu_type_(UNKNOWN) { std::vector all_platforms; cl::Platform::get(&all_platforms); if (all_platforms.size() == 0) { - LOG(FATAL) << "No OpenCL platforms found"; + LOG(ERROR) << "No OpenCL platforms found"; + return; } cl::Platform default_platform = all_platforms[0]; std::stringstream ss; @@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime(): std::vector all_devices; default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); if (all_devices.size() == 0) { - LOG(FATAL) << "No OpenCL devices found"; + LOG(ERROR) << "No OpenCL devices found"; + return; } bool gpu_detected = false; @@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime(): const std::string device_version = device.getInfo(); opencl_version_ = ParseDeviceVersion(device_version); + if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) { + return; + } VLOG(1) << "Using device: " << device_name; break; } } if (!gpu_detected) { - LOG(FATAL) << "No GPU device found"; + LOG(ERROR) << "No GPU device found"; + return; } cl_command_queue_properties properties = 0; @@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime(): new cl::Context({*device_}, nullptr, nullptr, nullptr, &err)); } } - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return; + } command_queue_ = std::make_shared(*context_, *device_, properties, &err); - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return; + } extern std::shared_ptr kStorageFactory; std::string cached_binary_platform_info; @@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime(): } if (cached_binary_platform_info != platform_info_) { - if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) { - LOG(WARNING) << "There is no precompiled OpenCL binary in" - " all OpenCL binary paths"; - } else { + if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) { precompiled_binary_storage_.reset( new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath)); if (precompiled_binary_storage_->Load() != 0) { @@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime(): } else { this->out_of_range_check_ = false; } + + is_opencl_avaliable_ = true; } OpenCLRuntime::~OpenCLRuntime() { @@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() { device_.reset(); } +bool OpenCLRuntime::is_opencl_avaliable() { + static const uint64_t kMinWorkGroupSize = 64; 
+ return is_opencl_avaliable_ + && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize; +} + cl::Context &OpenCLRuntime::context() { return *context_; } cl::Device &OpenCLRuntime::device() { return *device_; } @@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary( return true; } -void OpenCLRuntime::BuildProgramFromSource( +bool OpenCLRuntime::BuildProgramFromSource( const std::string &program_name, const std::string &built_program_key, const std::string &build_options_str, @@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource( LOG(WARNING) << "Build program " << program_name << " from source failed: " << MakeString(ret); - return; + return false; } // Keep built program binary @@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource( cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * device_list_size, program_binary_sizes.get(), nullptr); - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return false; + } std::unique_ptr[]> program_binaries( new std::unique_ptr[device_list_size]); for (cl_uint i = 0; i < device_list_size; ++i) { @@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource( err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES, sizeof(unsigned char *) * device_list_size, program_binaries.get(), nullptr); - MACE_CHECK_CL_SUCCESS(err); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return false; + } std::vector content( reinterpret_cast(program_binaries[0].get()), reinterpret_cast(program_binaries[0].get()) + @@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource( VLOG(3) << "Program from source: " << built_program_key; } + return true; } -void OpenCLRuntime::BuildProgram(const std::string &program_name, +bool OpenCLRuntime::BuildProgram(const std::string &program_name, const std::string &built_program_key, const std::string &build_options, cl::Program *program) { @@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name, ret = BuildProgramFromPrecompiledBinary(built_program_key, build_options_str, program); if (!ret) { - BuildProgramFromSource(program_name, built_program_key, - build_options_str, program); + ret = BuildProgramFromSource(program_name, built_program_key, + build_options_str, program); } } + return ret; } -cl::Kernel OpenCLRuntime::BuildKernel( +MaceStatus OpenCLRuntime::BuildKernel( const std::string &program_name, const std::string &kernel_name, - const std::set &build_options) { + const std::set &build_options, + cl::Kernel *kernel) { std::string build_options_str; for (auto &option : build_options) { build_options_str += " " + option; @@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel( if (built_program_it != built_program_map_.end()) { program = built_program_it->second; } else { - this->BuildProgram(program_name, built_program_key, build_options_str, - &program); + bool ret = this->BuildProgram(program_name, built_program_key, + build_options_str, &program); + if (!ret) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } built_program_map_.emplace(built_program_key, program); } - return cl::Kernel(program, kernel_name.c_str()); + cl_int err; + *kernel = cl::Kernel(program, kernel_name.c_str(), &err); + MACE_CL_RET_STATUS(err); + return MaceStatus::MACE_SUCCESS; } void OpenCLRuntime::SaveBuiltCLProgram() { @@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) { uint64_t 
OpenCLRuntime::GetDeviceMaxWorkGroupSize() { uint64_t size = 0; - device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); + cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() { uint64_t size = 0; - device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); + cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } +bool OpenCLRuntime::IsImageSupport() { + cl_bool res; + cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return false; + } + return res == CL_TRUE; +} +std::vector OpenCLRuntime::GetMaxImage2DSize() { + size_t max_height, max_width; + cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return {}; + } + err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + return {}; + } + return {max_height, max_width}; +} + uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { uint64_t size = 0; - kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size); + cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, + &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { uint64_t size = 0; - kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size); + cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, + &size); + if (err != CL_SUCCESS) { + LOG(ERROR) << "error: " << OpenCLErrorToString(err); + size = 0; + } return size; } @@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion( } else if (words[1] == "1.0") { return OpenCLVersion::CL_VER_1_0; } else { - LOG(FATAL) << "Do not support OpenCL version: " << words[1]; - return OpenCLVersion::CL_VER_1_0; + LOG(ERROR) << "Do not support OpenCL version: " << words[1]; + return OpenCLVersion::CL_VER_UNKNOWN; } } diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index f87b4580..537707fa 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -42,13 +42,23 @@ enum OpenCLVersion { CL_VER_1_1, CL_VER_1_2, CL_VER_2_0, + CL_VER_UNKNOWN, }; const std::string OpenCLErrorToString(cl_int error); -#define MACE_CHECK_CL_SUCCESS(error) \ - MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error) +#define MACE_CL_RET_ERROR(error) \ + if (error != CL_SUCCESS) { \ + LOG(ERROR) << "error: " << OpenCLErrorToString(error); \ + return error; \ + } + +#define MACE_CL_RET_STATUS(error) \ + if (error != CL_SUCCESS) { \ + LOG(ERROR) << "error: " << OpenCLErrorToString(error); \ + return MaceStatus::MACE_OUT_OF_RESOURCES; \ + } class OpenCLProfilingTimer : public Timer { public: @@ -81,19 +91,23 @@ class OpenCLRuntime { const std::string platform_info() const; uint64_t device_global_mem_cache_size() const; uint32_t device_compute_units() const; + bool is_opencl_avaliable(); void GetCallStats(const cl::Event &event, CallStats *stats); uint64_t 
GetDeviceMaxWorkGroupSize(); uint64_t GetDeviceMaxMemAllocSize(); + bool IsImageSupport(); + std::vector GetMaxImage2DSize(); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel); bool IsNonUniformWorkgroupsSupported() const; bool IsOutOfRangeCheckEnabled() const; bool is_profiling_enabled() const; - cl::Kernel BuildKernel(const std::string &program_name, + MaceStatus BuildKernel(const std::string &program_name, const std::string &kernel_name, - const std::set &build_options); + const std::set &build_options, + cl::Kernel *kernel); void SaveBuiltCLProgram(); @@ -103,7 +117,7 @@ class OpenCLRuntime { OpenCLRuntime(const OpenCLRuntime &) = delete; OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; - void BuildProgram(const std::string &program_file_name, + bool BuildProgram(const std::string &program_file_name, const std::string &binary_file_name, const std::string &build_options, cl::Program *program); @@ -115,7 +129,7 @@ class OpenCLRuntime { const std::string &built_program_key, const std::string &build_options_str, cl::Program *program); - void BuildProgramFromSource( + bool BuildProgramFromSource( const std::string &program_name, const std::string &built_program_key, const std::string &build_options_str, @@ -125,6 +139,7 @@ class OpenCLRuntime { private: std::unique_ptr precompiled_binary_storage_; std::unique_ptr cache_storage_; + bool is_opencl_avaliable_; bool is_profiling_enabled_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc index 26a9fb7e..a7fd447e 100644 --- a/mace/core/runtime/opencl/opencl_wrapper.cc +++ b/mace/core/runtime/opencl/opencl_wrapper.cc @@ -257,6 +257,10 @@ OpenCLLibrary *OpenCLLibrary::Get() { OpenCLLibrary::OpenCLLibrary() { this->Load(); + // Do not call dlclose, which may unload all OpenCL symbols. + // If we close the OpenCL library, the static OpenCLRuntime destructor may fail. + // Without dlclose, the library will be closed when the program exits. + // Besides, the library will not be loaded repeatedly even if dlopen is called many times. } bool OpenCLLibrary::Load() { @@ -293,13 +297,12 @@ bool OpenCLLibrary::Load() { } if (handle_ == nullptr) { - LOG(FATAL) << "Failed to load OpenCL library, " + LOG(ERROR) << "Failed to load OpenCL library, " "please make sure there exists OpenCL library on your device, " "and your APP have right to access the library."; return false; } - // Do not dlclose, leave it to system.
return true; } @@ -384,9 +387,12 @@ CL_API_ENTRY cl_int clGetPlatformIDs(cl_uint num_entries, cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetPlatformIDs; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetPlatformIDs"); - return func(num_entries, platforms, num_platforms); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetPlatformIDs"); + return func(num_entries, platforms, num_platforms); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetPlatformInfo(cl_platform_id platform, @@ -396,10 +402,13 @@ CL_API_ENTRY cl_int clGetPlatformInfo(cl_platform_id platform, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetPlatformInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetPlatformInfo"); - return func(platform, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetPlatformInfo"); + return func(platform, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Device APIs @@ -410,9 +419,12 @@ CL_API_ENTRY cl_int clGetDeviceIDs(cl_platform_id platform, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetDeviceIDs; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetDeviceIDs"); - return func(platform, device_type, num_entries, devices, num_devices); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetDeviceIDs"); + return func(platform, device_type, num_entries, devices, num_devices); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetDeviceInfo(cl_device_id device, @@ -422,26 +434,35 @@ CL_API_ENTRY cl_int clGetDeviceInfo(cl_device_id device, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetDeviceInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetDeviceInfo"); - return func(device, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetDeviceInfo"); + return func(device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainDevice; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainDevice"); - return func(device); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainDevice"); + return func(device); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseDevice; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseDevice"); - return func(device); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseDevice"); + return func(device); + } else { + return CL_INVALID_PLATFORM; + } } // Context APIs @@ -453,10 +474,14 @@ CL_API_ENTRY cl_context clCreateContext( void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateContext; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateContext"); - return func(properties, num_devices, devices, pfn_notify, user_data, - errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, 
"clCreateContext"); + return func(properties, num_devices, devices, pfn_notify, user_data, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_context clCreateContextFromType( @@ -466,25 +491,35 @@ CL_API_ENTRY cl_context clCreateContextFromType( void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateContextFromType; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateContextFromType"); - return func(properties, device_type, pfn_notify, user_data, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateContextFromType"); + return func(properties, device_type, pfn_notify, user_data, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainContext; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainContext"); - return func(context); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainContext"); + return func(context); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseContext; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseContext"); - return func(context); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseContext"); + return func(context); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetContextInfo(cl_context context, @@ -494,10 +529,13 @@ CL_API_ENTRY cl_int clGetContextInfo(cl_context context, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetContextInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetContextInfo"); - return func(context, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetContextInfo"); + return func(context, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Program Object APIs @@ -508,9 +546,13 @@ CL_API_ENTRY cl_program clCreateProgramWithSource(cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateProgramWithSource; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateProgramWithSource"); - return func(context, count, strings, lengths, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateProgramWithSource"); + return func(context, count, strings, lengths, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_program @@ -522,10 +564,14 @@ clCreateProgramWithBinary(cl_context context, cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateProgramWithBinary; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateProgramWithBinary"); - return func(context, num_devices, device_list, lengths, binaries, - binary_status, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateProgramWithBinary"); + return func(context, num_devices, device_list, lengths, binaries, + binary_status, errcode_ret); + } else { + if 
(errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clGetProgramInfo(cl_program program, @@ -535,10 +581,13 @@ CL_API_ENTRY cl_int clGetProgramInfo(cl_program program, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetProgramInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetProgramInfo"); - return func(program, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetProgramInfo"); + return func(program, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetProgramBuildInfo(cl_program program, @@ -549,26 +598,35 @@ CL_API_ENTRY cl_int clGetProgramBuildInfo(cl_program program, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetProgramBuildInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetProgramBuildInfo"); - return func(program, device, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetProgramBuildInfo"); + return func(program, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainProgram; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainProgram"); - return func(program); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainProgram"); + return func(program); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseProgram; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseProgram"); - return func(program); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseProgram"); + return func(program); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clBuildProgram( @@ -579,10 +637,13 @@ CL_API_ENTRY cl_int clBuildProgram( void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), void *user_data) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clBuildProgram; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clBuildProgram"); - return func(program, num_devices, device_list, options, pfn_notify, - user_data); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clBuildProgram"); + return func(program, num_devices, device_list, options, pfn_notify, + user_data); + } else { + return CL_INVALID_PLATFORM; + } } // Kernel Object APIs @@ -591,25 +652,35 @@ CL_API_ENTRY cl_kernel clCreateKernel(cl_program program, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateKernel; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateKernel"); - return func(program, kernel_name, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateKernel"); + return func(program, kernel_name, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainKernel; - MACE_CHECK_NOTNULL(func); - 
MACE_LATENCY_LOGGER(3, "clRetainKernel"); - return func(kernel); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainKernel"); + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseKernel; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseKernel"); - return func(kernel); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseKernel"); + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, @@ -618,9 +689,12 @@ CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, const void *arg_value) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clSetKernelArg; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clSetKernelArg"); - return func(kernel, arg_index, arg_size, arg_value); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clSetKernelArg"); + return func(kernel, arg_index, arg_size, arg_value); + } else { + return CL_INVALID_PLATFORM; + } } // Memory Object APIs @@ -631,9 +705,13 @@ CL_API_ENTRY cl_mem clCreateBuffer(cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateBuffer"); - return func(context, flags, size, host_ptr, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateBuffer"); + return func(context, flags, size, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_mem clCreateImage(cl_context context, @@ -644,25 +722,40 @@ CL_API_ENTRY cl_mem clCreateImage(cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateImage; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateImage"); - return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateImage"); + return func(context, + flags, + image_format, + image_desc, + host_ptr, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainMemObject; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainMemObject"); - return func(memobj); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainMemObject"); + return func(memobj); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseMemObject; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseMemObject"); - return func(memobj); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseMemObject"); + return func(memobj); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, @@ -672,10 +765,13 @@ CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetImageInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetImageInfo"); - return func(image, param_name, param_value_size, param_value, - param_value_size_ret); + if 
(func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetImageInfo"); + return func(image, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Command Queue APIs @@ -686,25 +782,35 @@ CL_API_ENTRY cl_command_queue clCreateCommandQueueWithProperties( cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateCommandQueueWithProperties; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateCommandQueueWithProperties"); - return func(context, device, properties, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateCommandQueueWithProperties"); + return func(context, device, properties, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainCommandQueue; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainCommandQueue"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainCommandQueue"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseCommandQueue; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseCommandQueue"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseCommandQueue"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } // Enqueued Commands APIs @@ -719,10 +825,13 @@ CL_API_ENTRY cl_int clEnqueueReadBuffer(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueReadBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueReadBuffer"); - return func(command_queue, buffer, blocking_read, offset, size, ptr, - num_events_in_wait_list, event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueReadBuffer"); + return func(command_queue, buffer, blocking_read, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, @@ -736,10 +845,13 @@ CL_API_ENTRY cl_int clEnqueueWriteBuffer(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueWriteBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueWriteBuffer"); - return func(command_queue, buffer, blocking_write, offset, size, ptr, - num_events_in_wait_list, event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueWriteBuffer"); + return func(command_queue, buffer, blocking_write, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY void *clEnqueueMapBuffer(cl_command_queue command_queue, @@ -754,10 +866,14 @@ CL_API_ENTRY void *clEnqueueMapBuffer(cl_command_queue command_queue, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueMapBuffer; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueMapBuffer"); - return func(command_queue, buffer, blocking_map, map_flags, offset, 
size, - num_events_in_wait_list, event_wait_list, event, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueMapBuffer"); + return func(command_queue, buffer, blocking_map, map_flags, offset, size, + num_events_in_wait_list, event_wait_list, event, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY void *clEnqueueMapImage(cl_command_queue command_queue, @@ -774,11 +890,15 @@ CL_API_ENTRY void *clEnqueueMapImage(cl_command_queue command_queue, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueMapImage; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueMapImage"); - return func(command_queue, image, blocking_map, map_flags, origin, region, - image_row_pitch, image_slice_pitch, num_events_in_wait_list, - event_wait_list, event, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueMapImage"); + return func(command_queue, image, blocking_map, map_flags, origin, region, + image_row_pitch, image_slice_pitch, num_events_in_wait_list, + event_wait_list, event, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } CL_API_ENTRY cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, @@ -789,10 +909,13 @@ CL_API_ENTRY cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueUnmapMemObject; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueUnmapMemObject"); - return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, - event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueUnmapMemObject"); + return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, + event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clGetKernelWorkGroupInfo( @@ -803,10 +926,13 @@ CL_API_ENTRY cl_int clGetKernelWorkGroupInfo( void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetKernelWorkGroupInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetKernelWorkGroupInfo"); - return func(kernel, device, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetKernelWorkGroupInfo"); + return func(kernel, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, @@ -820,34 +946,46 @@ CL_API_ENTRY cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_event *event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clEnqueueNDRangeKernel; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clEnqueueNDRangeKernel"); - return func(command_queue, kernel, work_dim, global_work_offset, - global_work_size, local_work_size, num_events_in_wait_list, - event_wait_list, event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clEnqueueNDRangeKernel"); + return func(command_queue, kernel, work_dim, global_work_offset, + global_work_size, local_work_size, num_events_in_wait_list, + event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } } // Event Object APIs CL_API_ENTRY cl_int clWaitForEvents( cl_uint num_events, const cl_event 
*event_list) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clWaitForEvents; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clWaitForEvents"); - return func(num_events, event_list); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clWaitForEvents"); + return func(num_events, event_list); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clRetainEvent; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clRetainEvent"); - return func(event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clRetainEvent"); + return func(event); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clReleaseEvent; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clReleaseEvent"); - return func(event); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clReleaseEvent"); + return func(event); + } else { + return CL_INVALID_PLATFORM; + } } // Event API @@ -858,10 +996,13 @@ CL_API_ENTRY cl_int clGetEventInfo(cl_event event, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetEventInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetEventInfo"); - return func(event, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetEventInfo"); + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Profiling APIs @@ -872,27 +1013,36 @@ CL_API_ENTRY cl_int clGetEventProfilingInfo(cl_event event, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clGetEventProfilingInfo; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clGetEventProfilingInfo"); - return func(event, param_name, param_value_size, param_value, - param_value_size_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clGetEventProfilingInfo"); + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } } // Flush and Finish APIs CL_API_ENTRY cl_int clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clFlush; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clFlush"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clFlush"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } CL_API_ENTRY cl_int clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { auto func = mace::runtime::OpenCLLibrary::Get()->clFinish; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clFinish"); - return func(command_queue); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clFinish"); + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } } // Deprecated OpenCL 1.1 APIs @@ -906,10 +1056,14 @@ CL_API_ENTRY /* CL_EXT_PREFIX__VERSION_1_1_DEPRECATED */ cl_mem clCreateImage2D( void *host_ptr, cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED */ { auto func = mace::runtime::OpenCLLibrary::Get()->clCreateImage2D; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateImage2D"); - return func(context, flags, image_format, image_width, image_height, - 
image_row_pitch, host_ptr, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateImage2D"); + return func(context, flags, image_format, image_width, image_height, + image_row_pitch, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } // Deprecated OpenCL 2.0 APIs @@ -920,7 +1074,11 @@ clCreateCommandQueue(cl_context context, cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED */ { // NOLINT auto func = mace::runtime::OpenCLLibrary::Get()->clCreateCommandQueue; - MACE_CHECK_NOTNULL(func); - MACE_LATENCY_LOGGER(3, "clCreateCommandQueue"); - return func(context, device, properties, errcode_ret); + if (func != nullptr) { + MACE_LATENCY_LOGGER(3, "clCreateCommandQueue"); + return func(context, device, properties, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } } diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index cb6be4ec..1e8826cd 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -204,26 +204,28 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, // TODO(liyin): memory block should not have concept of type, but to be // consistent with gpu, all memory block use float/half as unit for (auto &mem_block : net_def.mem_arena().mem_block()) { - if (device_type == DeviceType::GPU) { - // TODO(liuqi): refactor based on PB - if (mem_block.mem_id() >= 20000) { - std::unique_ptr image_buf( - new Image()); - MACE_RETURN_IF_ERROR(image_buf->Allocate( - {mem_block.x(), mem_block.y()}, dtype)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(image_buf)); - } - } else { - if (mem_block.mem_id() < 20000) { - std::unique_ptr tensor_buf( - new Buffer(GetDeviceAllocator(device_type))); - MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype) - + MACE_EXTRA_BUFFER_PAD_SIZE)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(tensor_buf)); - } + if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(GetDeviceAllocator(DeviceType::CPU))); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() * GetEnumTypeSize(dtype) + + MACE_EXTRA_BUFFER_PAD_SIZE)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { + std::unique_ptr image_buf( + new Image()); + MACE_RETURN_IF_ERROR(image_buf->Allocate( + {mem_block.x(), mem_block.y()}, dtype)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(image_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(GetDeviceAllocator(DeviceType::GPU))); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() * GetEnumTypeSize(dtype))); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); } } VLOG(3) << "Preallocate buffer to tensors"; diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 2270f295..ec93674e 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -219,7 +219,10 @@ bool RunModel(const std::vector &input_names, #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { - std::cerr << "Create engine error, please check the arguments" << std::endl; + std::cerr << "Create engine error, please check the arguments first, " + << "if correct, the device may not run the model, " + << "please fall back to 
other strategy." + << std::endl; exit(1); } diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index edb67873..2e343aa9 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -79,7 +79,8 @@ MaceStatus ActivationFunctor::operator()( default: LOG(FATAL) << "Unknown activation type: " << activation_; } - kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -115,7 +116,8 @@ MaceStatus ActivationFunctor::operator()( std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, + lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index d7270a6e..e47f5103 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -68,7 +68,8 @@ MaceStatus AddNFunctor::operator()( built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -111,7 +112,8 @@ MaceStatus AddNFunctor::operator()( std::string tuning_key = Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index 2769e08b..d2dce6d3 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor::operator()( LOG(FATAL) << "Unknown activation type: " << activation_; } - kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor::operator()( std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index d1e58bd3..b7023dd5 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); 
@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index b29f7e81..df104d66 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor::operator()( } } - auto b2f_kernel = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, built_options); + cl::Kernel b2f_kernel; + + MACE_RETURN_IF_ERROR(runtime->BuildKernel( + "buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel)); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor::operator()( b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 63e47267..8babf338 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = - runtime->BuildKernel("channel_shuffle", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("channel_shuffle", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor::operator()( std::string tuning_key = Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 65540e48..98ac4342 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -24,12 +24,18 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = std::min(base, kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::max(std::min(base, kwg_size / lws_size), 1); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(base, kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = + std::max(std::min(base, kwg_size / lws_size), 1); + } return lws; } @@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, 
if (input0->dim(3) % 4 == 0) { built_options.emplace("-DDIVISIBLE_FOUR"); } - *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel, std::string tuning_key = Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); @@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } @@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); char *kerror_code = (*kernel_error)->mutable_data(); diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index 48ea04d3..e5eb2134 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -27,30 +27,36 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t lws_limit = 128; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); - const uint32_t base = - std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], base); - } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) { - lws[0] = std::min(gws[0], base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = gws[0] / 8; - if (lws[0] < base) { - lws[0] = std::max(gws[0] / 4, base); + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + const uint32_t base = + std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] < base) { + lws[0] = std::max(gws[0] / 4, base); + } } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / lws_size / compute_units) * 8, + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min( - (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]); - if (lws[2] == 0) { - 
lws[2] = std::min(gws[2], base); - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, std::string tuning_key = Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 0fa295d0..9984fe10 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -26,25 +26,30 @@ namespace { const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = std::max( - OpenCLRuntime::Global()->device_compute_units() / 2, 1); - const uint32_t base = - std::max( - std::min(cache_size / kBaseGPUMemCacheSize, 4), 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = - std::min(std::min(gws[0], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min( - RoundUp( - cache_size / kernel_cache_size / lws_size / compute_units, base), - gws[2]); - if (lws[2] == 0) { - lws[2] = std::min(gws[2], base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = std::max( + OpenCLRuntime::Global()->device_compute_units() / 2, 1); + const uint32_t base = + std::max( + std::min(cache_size / kBaseGPUMemCacheSize, 4), 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = + std::min(std::min(gws[0], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + RoundUp( + cache_size / kernel_cache_size / lws_size / compute_units, base), + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, std::string tuning_key = Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/conv_2d_general.cc 
b/mace/kernels/opencl/conv_2d_general.cc index c3208eca..a6e29694 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -30,30 +30,35 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kernel_size, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); - const uint32_t base = - std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = gws[0] / 4; - if (lws[0] == 0) { - lws[0] = gws[0]; - } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min((cache_size / kernel_cache_size / kernel_size / - lws_size / compute_units) * - 8, - gws[2]); - if (lws[2] == 0) { - if (gws[2] < lws_limit) { - lws[2] = gws[2]; - } else { - lws[2] = base; + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + const uint32_t base = + std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min((cache_size / kernel_cache_size / kernel_size / + lws_size / compute_units) * + 8, + gws[2]); + if (lws[2] == 0) { + if (gws[2] < lws_limit) { + lws[2] = gws[2]; + } else { + lws[2] = base; + } } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, output->dim(2), output->dim(3), filter->dim(2), filter->dim(3)); std::vector lws = LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc index 297b8dd3..ba9248d1 100644 --- a/mace/kernels/opencl/crop.cc +++ b/mace/kernels/opencl/crop.cc @@ -24,12 +24,18 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[0] = std::min(base, kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::max(std::min(base, kwg_size / lws_size), 1); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(base, kwg_size / 
lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = + std::max(std::min(base, kwg_size / lws_size), 1); + } return lws; } @@ -147,7 +153,8 @@ MaceStatus CropFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("crop", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -181,7 +188,8 @@ MaceStatus CropFunctor::operator()( std::string tuning_key = Concat("crop_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index e40ac3b3..ac7af70d 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, std::string tuning_key = Concat("deconv2d_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index bf339f40..cd379b22 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( } const std::vector lws = Default3DLocalWS(gws, kwg_size_); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index ca44be2f..517ff16d 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -26,27 +26,33 @@ namespace { const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], 
base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); - if (lws[0] < base) { - lws[0] = std::min(std::max(gws[0] / 4, base), - kwg_size / lws[1]); + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); + if (lws[0] < base) { + lws[0] = std::min(std::max(gws[0] / 4, base), + kwg_size / lws[1]); + } } + lws[0] = + std::max(std::min(lws[0], kwg_size / lws[1]), 1); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min((cache_size / kernel_cache_size / lws_size) * 4, + gws[2]); + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[0] = std::max(std::min(lws[0], kwg_size / lws[1]), 1); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min((cache_size / kernel_cache_size / lws_size) * 4, - gws[2]); - if (lws[2] == 0) { - lws[2] = gws[2]; - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = - runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("depthwise_conv2d", kernel_name, + built_options, kernel)); *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, const std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); - TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index c0a74c42..503d5d5d 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, std::string tuning_key = Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index 7888287a..cbd046b4 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - *kernel = - runtime->BuildKernel("fully_connected", kernel_name, built_options); + 
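The mechanical change repeated across these kernels is that BuildKernel now reports failure through a status value (filling the kernel via an output parameter) and every call site wraps it in MACE_RETURN_IF_ERROR, so a GPU failure propagates to the caller instead of aborting the process. Below is a minimal sketch of that early-return idiom with stand-in names; the real macro and status enum live in MACE's public headers and may differ in detail.

#include <iostream>

enum class Status { kSuccess = 0, kOutOfResources = 1 };

// Early-return helper in the spirit of MACE_RETURN_IF_ERROR (stand-in name).
#define RETURN_IF_ERROR_SKETCH(stmt)          \
  do {                                        \
    const Status s_ = (stmt);                 \
    if (s_ != Status::kSuccess) return s_;    \
  } while (0)

// Hypothetical stand-in for runtime->BuildKernel(..., &kernel).
Status BuildKernelStub(bool driver_ok) {
  return driver_ok ? Status::kSuccess : Status::kOutOfResources;
}

Status InitFullyConnectedKernel(bool driver_ok) {
  RETURN_IF_ERROR_SKETCH(BuildKernelStub(driver_ok));  // propagate build failure
  // ... set kernel arguments, query max work-group size, enqueue ...
  return Status::kSuccess;
}

int main() {
  std::cout << static_cast<int>(InitFullyConnectedKernel(false)) << "\n";  // prints 1
  return 0;
}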
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name, + built_options, kernel)); if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); @@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; (*kernel_error)->UnMap(); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { @@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, default: LOG(FATAL) << "Unknown activation type: " << activation; } - *kernel = - runtime->BuildKernel("fully_connected", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("fully_connected", kernel_name, + built_options, kernel)); uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); @@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, std::string tuning_key = Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, + gws->data(), *lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 009bf276..6d882352 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -245,23 +245,27 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { std::vector Default3DLocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = - OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[2] = - std::min(std::min(gws[2], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[1] * lws[2]; - lws[0] = std::max(std::min(base, kwg_size / lws_size), - 1); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[2] = + std::min(std::min(gws[2], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = std::max(std::min(base, kwg_size / lws_size), + 1); + } return lws; } -void TuningOrRun3DKernel(const cl::Kernel &kernel, - const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future) { +MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); auto params_generator = [&]() -> std::vector> { @@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, std::vector internal_gws(gws, gws + 3); if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 3; ++i) { + MACE_CHECK(params[i] != 0); internal_gws[i] = RoundUp(gws[i], params[i]); } } @@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(internal_gws[0], internal_gws[1], gws2), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); } } else { timer->ClearTiming(); @@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const 
cl::Kernel &kernel, kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(internal_gws[0], internal_gws[1], gws2), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); } } @@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, return error; }; OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( + cl_int err = Tuner::Get()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); + MACE_CL_RET_STATUS(err); if (future != nullptr) { future->wait_fn = [event](CallStats *stats) { @@ -388,13 +394,14 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, } }; } + return MaceStatus::MACE_SUCCESS; } -void TuningOrRun2DKernel(const cl::Kernel &kernel, - const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future) { +MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); auto params_generator = [&]() -> std::vector> { @@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, std::vector internal_gws(gws, gws + 2); if (!runtime->IsNonUniformWorkgroupsSupported()) { for (size_t i = 0; i < 2; ++i) { + MACE_CHECK(params[i] != 0); internal_gws[i] = RoundUp(gws[i], params[i]); } } @@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(internal_gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); } } else { timer->ClearTiming(); error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]), cl::NDRange(params[0], params[1]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); tuning_result->assign(params.begin(), params.end()); @@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, kernel, cl::NDRange(0, i * block_size), cl::NDRange(internal_gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_ERROR(error); timer->AccumulateTiming(); } } @@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, return error; }; OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( + cl_int err = Tuner::Get()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); + MACE_CL_RET_STATUS(err); + if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); @@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } }; } + return MaceStatus::MACE_SUCCESS; } } // namespace kernels diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index d1e26002..5db95e3d 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -65,17 +65,17 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpstreamCLDt(const DataType dt); -void TuningOrRun3DKernel(const cl::Kernel &kernel, 
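In helper.cc the tuning wrappers change from void to status-returning functions, and the error handling is split into two levels: inside the enqueue closures the raw cl_int is handed back to the tuner (MACE_CL_RET_ERROR), and only the outer call translates the final code into a status for the operator (MACE_CL_RET_STATUS). The sketch below shows that two-level translation with the OpenCL types mocked as plain ints; it is an illustration of the control flow, not the real helper.

#include <functional>
#include <iostream>

using cl_err = int;               // stand-in for cl_int
constexpr cl_err kCLSuccess = 0;  // stand-in for CL_SUCCESS

enum class Status { kSuccess, kOutOfResources };

// Inner level: return the raw OpenCL error code to the tuning loop.
cl_err RunCandidate(const std::function<cl_err()> &enqueue) {
  const cl_err error = enqueue();
  if (error != kCLSuccess) return error;  // analogue of MACE_CL_RET_ERROR
  return kCLSuccess;
}

// Outer level: convert the final error code into a status for the caller.
Status TuneOrRunSketch(const std::function<cl_err()> &enqueue) {
  const cl_err err = RunCandidate(enqueue);
  if (err != kCLSuccess) {                // analogue of MACE_CL_RET_STATUS
    std::cerr << "OpenCL error code " << err << "\n";
    return Status::kOutOfResources;
  }
  return Status::kSuccess;
}

int main() {
  return TuneOrRunSketch([] { return kCLSuccess; }) == Status::kSuccess ? 0 : 1;
}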
- const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future); - -void TuningOrRun2DKernel(const cl::Kernel &kernel, - const std::string tuning_key, - const uint32_t *gws, - const std::vector &lws, - StatsFuture *future); +MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future); + +MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + const std::vector &lws, + StatsFuture *future); inline void SetFuture(StatsFuture *future, const cl::Event &event) { if (future != nullptr) { diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 18f92b67..c8635e1a 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor::operator()( kernel_error_->UnMap(); } } - - auto b2f_kernel = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, built_options); + cl::Kernel b2f_kernel; + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &b2f_kernel)); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor::operator()( b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 444dcd29..f9825eb6 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -74,7 +74,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -99,7 +100,8 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, const std::vector lws = {kwg_size_ / 64, 64, 0}; std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index 71907c24..2497a4df 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer, kernel_error->UnMap(); } - auto b2f_kernel = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, built_options); + cl::Kernel b2f_kernel; + + cl_int error = runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, &b2f_kernel); + if (error != CL_SUCCESS) { + return false; + } uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer, const std::vector lws = {16, kwg_size / 16}; cl::Event event; - cl_int error; if 
(runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), @@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer, b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + if (error != CL_SUCCESS) { + return false; + } runtime->command_queue().finish(); bool is_out_of_range = false; diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 963f25e7..1a8879e8 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -68,7 +68,8 @@ MaceStatus PadFunctor::operator()(const Tensor *input, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("pad", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -104,7 +105,8 @@ MaceStatus PadFunctor::operator()(const Tensor *input, const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 09bc8d90..405c05a1 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -25,18 +25,23 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - lws[2] = - std::min(std::min(gws[2], base), kwg_size / lws[1]); - const uint32_t lws_size = lws[1] * lws[2]; - lws[0] = gws[0] / 4; - if (lws[0] == 0) { - lws[0] = gws[0]; + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; + } else { + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + lws[2] = + std::min(std::min(gws[2], base), kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), + 1); } - lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), - 1); return lws; } @@ -80,7 +85,10 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", + kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -160,7 +168,8 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, std::string tuning_key = Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws.data(), lws, future)); if 
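Alongside the status plumbing, the out-of-range check keeps its existing shape: the kernel writes a one-byte kernel_error flag when it detects an access past an image boundary, and the host maps that buffer after the run and requires it to be zero. The idea in isolation, with the mapped buffer mocked as a plain byte vector:

#include <iostream>
#include <vector>

// Non-zero in the first byte means the GPU kernel reported an out-of-range write.
bool KernelWroteOutOfRange(const std::vector<char> &mapped_error_buffer) {
  return !mapped_error_buffer.empty() && mapped_error_buffer[0] != 0;
}

int main() {
  std::vector<char> ok_flag = {0};
  std::vector<char> bad_flag = {1};
  std::cout << KernelWroteOutOfRange(ok_flag) << "\n";   // 0
  std::cout << KernelWroteOutOfRange(bad_flag) << "\n";  // 1
  return 0;
}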
(runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/reduce_mean_opencl.cc b/mace/kernels/opencl/reduce_mean_opencl.cc index cd27ad1c..266a1111 100644 --- a/mace/kernels/opencl/reduce_mean_opencl.cc +++ b/mace/kernels/opencl/reduce_mean_opencl.cc @@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor::operator()( *(kernel_error_->mutable_data()) = 0; kernel_error_->UnMap(); } - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean", + kernel_name, + built_options, + &kernel_)); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) { @@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor::operator()( cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; kernel_error_->UnMap(); } - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (future != nullptr) { future->wait_fn = [runtime, event](CallStats *stats) { diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index ee823116..c5b8b65b 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -25,25 +25,30 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); - lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= base) { - lws[0] = std::min(gws[0], base); + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = gws[0] / 8; - if (lws[0] == 0) { - lws[0] = gws[0]; + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] == 0) { + lws[0] = gws[0]; + } } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = gws[2] / 8; + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = gws[2] / 8; - if (lws[2] == 0) { - lws[2] = gws[2]; - } - lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), - 1); return lws; } @@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = - runtime->BuildKernel("resize_bilinear", kernel_name, built_options); + MACE_RETURN_IF_ERROR( + runtime->BuildKernel("resize_bilinear", + kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor::operator()( std::string tuning_key = 
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 160ad003..726bcae9 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -61,7 +61,10 @@ MaceStatus SliceFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice", + kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -107,7 +110,7 @@ MaceStatus SliceFunctor::operator()( cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); } - MACE_CHECK_CL_SUCCESS(error); + MACE_CL_RET_STATUS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 38d0b8bb..07855488 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -25,19 +25,23 @@ namespace kernels { namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); std::vector lws(4, 0); - lws[1] = std::min(gws[1], kwg_size); - if (gws[0] < base) { - lws[0] = gws[0]; + if (kwg_size == 0) { + lws[0] = lws[1] = lws[2] = 1; } else { - lws[0] = gws[0] / base; + uint64_t + cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); + lws[1] = std::min(gws[1], kwg_size); + if (gws[0] < base) { + lws[0] = gws[0]; + } else { + lws[0] = gws[0] / base; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + lws[2] = std::max(std::min( + gws[2], kwg_size / (lws[0] * lws[1])), 1); } - lws[0] = std::min(lws[0], kwg_size / lws[1]); - lws[2] = std::max(std::min(gws[2], - kwg_size / (lws[0] * lws[1])), - 1); return lws; } @@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name, + built_options, &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, std::vector lws = LocalWS(gws, kwg_size_); std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index b558ba7d..3606d91d 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -77,8 +77,10 @@ MaceStatus 
SpaceToBatchFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor::operator()( std::string tuning_key = Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); - TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 790ab181..6cc8e08c 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor::operator()( if (runtime->IsNonUniformWorkgroupsSupported()) { built_options.emplace("-DNON_UNIFORM_WORK_GROUP"); } - kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor::operator()( output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); @@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor::operator()( LOG(FATAL) << "Unknown activation type: " << activation_; } - kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, - built_options); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + &kernel_)); kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); @@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor::operator()( Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3), input_tensor->dim(2)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + gws, lws, future)); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 03731078..470d082f 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data, MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ", strerror(errno)); } + +#ifdef MACE_ENABLE_OPENCL +MaceStatus CheckGPUAvalibility(const NetDef *net_def) { + // Check OpenCL avaliable + auto runtime = OpenCLRuntime::Global(); + if (!runtime->is_opencl_avaliable()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + + // Check whether model max OpenCL image sizes exceed OpenCL limitation. 
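The check that follows this comment compares the model's recorded maximum OpenCL image shape (the new opencl_max_image_size argument written by the converter) against the 2D image limits the runtime reports for the device. A standalone sketch of that comparison is shown below; the device limits here are hypothetical values, while the real ones come from the OpenCL device query.

#include <cstdint>
#include <iostream>
#include <vector>

// Returns true if the model's largest requested 2D image fits on the device.
bool ImageSizeFits(const std::vector<int64_t> &net_max_image_size,       // {width, height} from the model
                   const std::vector<uint64_t> &device_max_image_size) {  // {width, height} from the device
  if (net_max_image_size.size() < 2 || device_max_image_size.size() < 2) {
    return false;
  }
  return static_cast<uint64_t>(net_max_image_size[0]) <= device_max_image_size[0] &&
         static_cast<uint64_t>(net_max_image_size[1]) <= device_max_image_size[1];
}

int main() {
  const std::vector<uint64_t> device_limit = {16384, 16384};  // hypothetical limits
  std::cout << ImageSizeFits({8192, 4096}, device_limit) << "\n";   // 1
  std::cout << ImageSizeFits({20000, 4096}, device_limit) << "\n";  // 0
  return 0;
}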
+ if (net_def == nullptr) { + return MaceStatus::MACE_INVALID_ARGS; + } + + if (!runtime->IsImageSupport()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + + auto opencl_max_image_size = runtime->GetMaxImage2DSize(); + if (opencl_max_image_size.empty()) { + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + + const std::vector net_max_image_size = + ProtoArgHelper::GetRepeatedArgs( + *net_def, "opencl_max_image_size", {0, 0}); + + if (static_cast(net_max_image_size[0]) > opencl_max_image_size[0] + || static_cast(net_max_image_size[1]) + > opencl_max_image_size[1]) { + LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size) + << " vs " << MakeString(net_max_image_size); + return MaceStatus::MACE_OUT_OF_RESOURCES; + } + return MaceStatus::MACE_SUCCESS; +} +#endif + } // namespace // Mace Tensor @@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init( const std::vector &output_nodes, const unsigned char *model_data) { LOG(INFO) << "Initializing MaceEngine"; + // Check avalibility +#ifdef MACE_ENABLE_OPENCL + if (device_type_ == DeviceType::GPU) { + MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def)); + } +#endif // Get input and output information. for (auto &input_info : net_def->input_info()) { input_info_map_[input_info.name()] = input_info; diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 08ad648e..08d862ea 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -20,6 +20,12 @@ enum DataType { DT_INT32 = 4; } +enum MemoryType { + CPU_BUFFER = 0; + GPU_BUFFER = 1; + GPU_IMAGE = 2; +} + message ConstTensor { repeated int64 dims = 1; optional DataType data_type = 2 [default = DT_FLOAT]; @@ -73,8 +79,9 @@ message OperatorDef { // for memory optimization message MemoryBlock { optional int32 mem_id = 1; - optional uint32 x = 2; - optional uint32 y = 3; + optional MemoryType mem_type = 2; + optional uint32 x = 3; + optional uint32 y = 4; } message MemoryArena { repeated MemoryBlock mem_block = 1; diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 6fac6ec5..13ee9a52 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory std::unique_ptr impl_; }; -// Set Key-Value store factory. (Call Once) -// Now KVStorage is used to store the built OpenCL binaries to file, -// which could speed up the GPU initialization and first run. -// If do not call this API, the initialization maybe slow for GPU. +/// \brief Set internal storage factory to store internal data. (Call once) +/// +/// Now the path is used to store the built OpenCL binaries to file, +/// which could speed up the GPU initialization and first run. +/// If do not call this API, the initialization maybe slow for GPU. +/// +/// \param path Make sure your program have Read/Write permission of the path +/// \return __attribute__((visibility("default"))) void SetKVStorageFactory(std::shared_ptr storage_factory); -// Just call once. (Not thread-safe) -// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) -// if you use gpu of specific soc. -// Using OpenCL binary will speed up the initialization. -// OpenCL binary is corresponding to the OpenCL Driver version, -// you should update the binary when OpenCL Driver changed. +/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length) +/// +/// Just call once. 
(Not thread-safe) +/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length) +/// OpenCL binary is corresponding to the OpenCL Driver version, +/// you should update the binary when OpenCL Driver changed. +/// +/// \param paths MACE will use first file found in all paths +/// \return __attribute__((visibility("default"))) void SetOpenCLBinaryPaths(const std::vector &paths); -// Just call once. (Not thread-safe) -// Set the path of Generated OpenCL parameter file -// if you use gpu for specific soc. -// The parameters is the local work group size tuned for specific SOC, which -// may be faster than the general parameters. +/// \brief Set the path of Generated OpenCL parameter file +/// +/// Just call once. (Not thread-safe) +/// If you use gpu for specific soc, The parameters is the local work group +/// size tuned for specific SOC, which may be faster than the +/// general parameters. +/// +/// \param path Make sure your program have Read/Write permission of the path +/// \return __attribute__((visibility("default"))) void SetOpenCLParameterPath(const std::string &path); -// Set GPU hints, currently only supports Adreno GPU. -// -// Caution: this function may hurt performance if improper parameters provided. +/// \brief Set GPU hints, currently only supports Adreno GPU. +/// +/// Caution: this function may hurt performance +/// if improper parameters provided. +/// +/// \param perf_hint performance hint +/// \param priority_hint priority hint +/// \return __attribute__((visibility("default"))) void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); -// Set OpenMP threads number and affinity policy. -// -// Caution: this function may hurt performance if improper parameters provided. -// -// num_threads_hint is only a hint. When num_threads_hint is zero or negative, -// the function will set the threads number equaling to the number of -// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all -// (AFFINITY_NONE) cores according to the policy. The threads number will -// also be truncated to the corresponding cores number when num_threads_hint -// is larger than it. -// -// The OpenMP threads will be bind to (via sched_setaffinity) big cores -// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY). -// -// If successful, it returns MACE_SUCCESS and error if it can't reliabley -// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's -// suggested to use AFFINITY_NONE to use all cores. +/// \brief Set OpenMP threads number and affinity policy. +/// +/// Caution: this function may hurt performance if improper parameters provided. +/// When num_threads_hint is zero or negative, +/// the function will set the threads number equaling to the number of +/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all +/// (AFFINITY_NONE) cores according to the policy. The threads number will +/// also be truncated to the corresponding cores number when num_threads_hint +/// is larger than it. +/// The OpenMP threads will be bind to (via sched_setaffinity) big cores +/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY). +/// +/// \param num_threads_hint it is only a hint. +/// \param policy one of CPUAffinityPolicy +/// \param status MACE_SUCCESS for successful, or it can't reliabley +/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's +/// suggested to use AFFINITY_NONE to use all cores. 
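The documented behaviour of SetOpenMPThreadPolicy is that a non-positive num_threads_hint selects all cores of the chosen class and an oversized hint is truncated to that core count. The sketch below resolves the hint exactly as documented, with hypothetical core counts; it is not MACE's actual implementation.

#include <algorithm>
#include <iostream>

enum CPUAffinityPolicy { AFFINITY_NONE, AFFINITY_BIG_ONLY, AFFINITY_LITTLE_ONLY };

// Mirrors the documented semantics: non-positive hint means "all cores of the
// selected class", and a too-large hint is truncated to that core count.
int EffectiveThreadCount(int num_threads_hint, CPUAffinityPolicy policy,
                         int big_cores, int little_cores) {
  int available = 0;
  switch (policy) {
    case AFFINITY_BIG_ONLY:    available = big_cores; break;
    case AFFINITY_LITTLE_ONLY: available = little_cores; break;
    case AFFINITY_NONE:        available = big_cores + little_cores; break;
  }
  if (num_threads_hint <= 0) return available;
  return std::min(num_threads_hint, available);
}

int main() {
  // Hypothetical 4 big + 4 little configuration.
  std::cout << EffectiveThreadCount(0, AFFINITY_BIG_ONLY, 4, 4) << "\n";  // 4
  std::cout << EffectiveThreadCount(16, AFFINITY_NONE, 4, 4) << "\n";     // 8
  return 0;
}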
+/// \return __attribute__((visibility("default"))) MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy); -// Set OpenMP threads number and processor affinity. -// -// Caution: this function may hurt performance if improper parameters provided. -// -// This function may not work well on some chips (e.g. MTK). Setting thread -// affinity to offline cores may run very slow or unexpectedly. In such cases, -// please use SetOpenMPThreadPolicy with default policy instead. +/// \brief Set OpenMP threads number and processor affinity. +/// +/// Caution: this function may hurt performance +/// if improper parameters provided. +/// This function may not work well on some chips (e.g. MTK). Setting thread +/// affinity to offline cores may run very slow or unexpectedly. +/// In such cases, please use SetOpenMPThreadPolicy with default policy +/// instead. +/// +/// \param num_threads +/// \param cpu_ids +/// \param status +/// \return __attribute__((visibility("default"))) MaceStatus SetOpenMPThreadAffinity(int num_threads, const std::vector &cpu_ids); -// Get ARM big.LITTLE configuration. -// -// This function will detect the max frequencies of all CPU cores, and assume -// the cores with largest max frequencies as big cores, and all the remaining -// cores as little. If all cpu core's max frequencies equals, big_core_ids and -// little_core_ids will both be filled with all cpu core ids. -// -// If successful, it returns MACE_SUCCESS and error if it can't reliabley -// detect the frequency of big-LITTLE cores (e.g. MTK). +/// \brief Get ARM big.LITTLE configuration. +/// +/// This function will detect the max frequencies of all CPU cores, and assume +/// the cores with largest max frequencies as big cores, and all the remaining +/// cores as little. If all cpu core's max frequencies equals, big_core_ids and +/// little_core_ids will both be filled with all cpu core ids. +/// +/// \param [out] big_core_ids +/// \param [out] little_core_ids +/// \return If successful, it returns MACE_SUCCESS and error if it can't +/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK). __attribute__((visibility("default"))) MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, std::vector *little_core_ids); diff --git a/mace/python/tools/convert_util.py b/mace/python/tools/convert_util.py index 18791987..31590913 100644 --- a/mace/python/tools/convert_util.py +++ b/mace/python/tools/convert_util.py @@ -12,7 +12,72 @@ # See the License for the specific language governing permissions and # limitations under the License. 
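GetBigLittleCoreIDs is documented to classify cores by maximum frequency: the fastest cluster counts as big, the rest as little, and when every core reports the same frequency both lists receive all core ids. The sketch below mirrors those rules over a plain frequency list; it illustrates the documented behaviour, not the library's detection code.

#include <algorithm>
#include <iostream>
#include <vector>

// Classify core ids into big/little clusters by their maximum frequency.
void ClassifyBigLittle(const std::vector<int> &max_freq_khz,
                       std::vector<int> *big_ids, std::vector<int> *little_ids) {
  big_ids->clear();
  little_ids->clear();
  if (max_freq_khz.empty()) return;
  const int top = *std::max_element(max_freq_khz.begin(), max_freq_khz.end());
  const int bottom = *std::min_element(max_freq_khz.begin(), max_freq_khz.end());
  for (int i = 0; i < static_cast<int>(max_freq_khz.size()); ++i) {
    if (top == bottom) {
      // Homogeneous CPU: both lists get every core id, as documented.
      big_ids->push_back(i);
      little_ids->push_back(i);
    } else if (max_freq_khz[i] == top) {
      big_ids->push_back(i);
    } else {
      little_ids->push_back(i);
    }
  }
}

int main() {
  std::vector<int> big, little;
  ClassifyBigLittle({1800000, 1800000, 2400000, 2400000}, &big, &little);
  std::cout << big.size() << " big, " << little.size() << " little\n";  // 2 big, 2 little
  return 0;
}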
+import enum + def mace_check(condition, msg): if not condition: raise Exception(msg) + + +def roundup_div4(value): + return int((value + 3) / 4) + + +class OpenCLBufferType(enum.Enum): + CONV2D_FILTER = 0 + IN_OUT_CHANNEL = 1 + ARGUMENT = 2 + IN_OUT_HEIGHT = 3 + IN_OUT_WIDTH = 4 + WINOGRAD_FILTER = 5 + DW_CONV2D_FILTER = 6 + WEIGHT_HEIGHT = 7 + WEIGHT_WIDTH = 8 + + +def calculate_image_shape(buffer_type, shape, winograd_blk_size=0): + # keep the same with mace/kernel/opencl/helper.cc + image_shape = [0, 0] + if buffer_type == OpenCLBufferType.CONV2D_FILTER: + mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D") + image_shape[0] = shape[1] + image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0]) + elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL: + mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D") + image_shape[0] = roundup_div4(shape[3]) * shape[2] + image_shape[1] = shape[0] * shape[1] + elif buffer_type == OpenCLBufferType.ARGUMENT: + mace_check(len(shape) == 1, + "Argument buffer should be 1D not " + str(shape)) + image_shape[0] = roundup_div4(shape[0]) + image_shape[1] = 1 + elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT: + mace_check(len(shape) == 4, "Input/output buffer should be 4D") + image_shape[0] = shape[2] * shape[3] + image_shape[1] = shape[0] * roundup_div4(shape[1]) + elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH: + mace_check(len(shape) == 4, "Input/output buffer should be 4D") + image_shape[0] = roundup_div4(shape[2]) * shape[3] + image_shape[1] = shape[0] * shape[1] + elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER: + mace_check(len(shape) == 4, "Winograd filter buffer should be 4D") + image_shape[0] = roundup_div4(shape[1]) + image_shape[1] = (shape[0] * (winograd_blk_size + 2) + * (winograd_blk_size + 2)) + elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER: + mace_check(len(shape) == 4, "Winograd filter buffer should be 4D") + image_shape[0] = shape[0] * shape[2] * shape[3] + image_shape[1] = roundup_div4(shape[1]) + elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT: + mace_check(len(shape) == 4, "Weight buffer should be 4D") + image_shape[0] = shape[1] * shape[2] * shape[3] + image_shape[1] = roundup_div4(shape[0]) + elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH: + mace_check(len(shape) == 4, "Weight buffer should be 4D") + image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3] + image_shape[1] = shape[0] + else: + mace_check(False, "OpenCL Image do not support type " + + str(buffer_type)) + return image_shape diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 2a069c1e..3fd856c7 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -171,6 +171,13 @@ def main(unused_args): output_graph_def.op.extend(cpu_graph_def.op) output_graph_def.mem_arena.mem_block.extend( cpu_graph_def.mem_arena.mem_block) + output_graph_arg_names = set() + for arg in output_graph_def.arg: + output_graph_arg_names.add(arg.name) + + for arg in cpu_graph_def.arg: + if arg.name not in output_graph_arg_names: + output_graph_def.arg.extend(arg) print "Merge done" else: option.device = device_type_map[FLAGS.runtime] diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 46b6247e..267fafeb 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -163,6 +163,7 @@ class MaceKeyword(object): mace_op_data_type_str = 'T' mace_offset_str = 
'offset' mace_from_caffe_str = 'from_caffe' + mace_opencl_max_image_size = "opencl_max_image_size" class TransformerRule(Enum): diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 850519f5..da8ecff1 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import TransformerRule +from mace.python.tools.convert_util import calculate_image_shape from mace.python.tools.convert_util import mace_check - -OPENCL_IMAGE_MAX_SIZE = 16384 +from mace.python.tools.convert_util import OpenCLBufferType -class OpenCLBufferType(enum.Enum): - CONV2D_FILTER = 0 - IN_OUT_CHANNEL = 1 - ARGUMENT = 2 - IN_OUT_HEIGHT = 3 - IN_OUT_WIDTH = 4 - WINOGRAD_FILTER = 5 - DW_CONV2D_FILTER = 6 - WEIGHT_HEIGHT = 7 - WEIGHT_WIDTH = 8 +OPENCL_IMAGE_MAX_SIZE = 16384 class Transformer(base_converter.ConverterInterface): @@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface): self._producer = {} self._target_data_format = DataFormat.NHWC self._input_output_added = False + self._opencl_max_image_size = [0, 0] if self._option.device == DeviceType.CPU.value: self._target_data_format = DataFormat.NCHW @@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface): arg.name = MaceKeyword.mace_mode arg.i = 0 + tensor_shape = list(self._consts[input_name].dims) if input_type == OpenCLBufferType.WINOGRAD_FILTER: blk_sqr = op.output_shape[0].dims[0] wino_blk = int(np.sqrt(blk_sqr)) - 2 wino_arg = op_def.arg.add() wino_arg.name = MaceKeyword.mace_wino_block_size wino_arg.i = wino_blk + img_shape = calculate_image_shape(input_type, tensor_shape, + wino_blk) + else: + img_shape = calculate_image_shape(input_type, tensor_shape) op.input[input_idx] = output_name + # update OpenCL max image size + self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0], + img_shape[0]) + self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], + img_shape[1]) + def transform_buffer_image(self): if self._option.device != DeviceType.GPU.value: return False @@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface): MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT) + # Add OpenCL max image size + arg = net.arg.add() + arg.name = MaceKeyword.mace_opencl_max_image_size + arg.ints.extend(self._opencl_max_image_size) + for input_node in self._option.input_nodes.values(): new_input_name = MaceKeyword.mace_input_node_name \ + '_' + input_node.name diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index 5b1d3d34..0c18a66b 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -16,6 +16,24 @@ import sys import operator from mace.proto import mace_pb2 +from mace.python.tools.converter_tool import base_converter as cvt +from mace.python.tools.convert_util import calculate_image_shape +from mace.python.tools.convert_util import OpenCLBufferType + + +class MemoryBlock(object): + def __init__(self, mem_type, block): + self._mem_type = mem_type + self._block = block + + @property + def mem_type(self): + return self._mem_type + + @property + def 
block(self): + return self._block + class MemoryOptimizer(object): def __init__(self, net_def): @@ -24,7 +42,6 @@ class MemoryOptimizer(object): self.op_mem = {} # op_name->mem_id self.mem_block = {} # mem_id->[size] or mem_id->[x, y] self.total_mem_count = 0 - self.total_cpu_mem_count = 0 self.input_ref_counter = {} self.mem_ref_counter = {} @@ -52,23 +69,27 @@ class MemoryOptimizer(object): return True def get_op_mem_block(self, op_type, output_shape): - return [reduce(operator.mul, output_shape, 1)] + return MemoryBlock(mace_pb2.CPU_BUFFER, + [reduce(operator.mul, output_shape, 1)]) def mem_size(self, memory_block): - return memory_block[0] + return memory_block.block[0] def sub_mem_block(self, mem_block1, mem_block2): return self.mem_size(mem_block1) - self.mem_size(mem_block2) def resize_mem_block(self, old_mem_block, op_mem_block): - return [max(old_mem_block[0], op_mem_block[0])] + return MemoryBlock( + old_mem_block.mem_type, + [max(old_mem_block.block[0], op_mem_block.block[0])]) def add_net_mem_blocks(self): for mem in self.mem_block: arena = self.net_def.mem_arena block = arena.mem_block.add() block.mem_id = mem - block.x = self.mem_block[mem][0] + block.mem_type = self.mem_block[mem].mem_type + block.x = self.mem_block[mem].block[0] block.y = 1 def get_total_origin_mem_size(self): @@ -82,7 +103,7 @@ class MemoryOptimizer(object): def get_total_optimized_mem_size(self): optimized_mem_size = 0 for mem in self.mem_block: - print mem, self.mem_block[mem] + print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block optimized_mem_size += self.mem_size(self.mem_block[mem]) return optimized_mem_size @@ -117,6 +138,8 @@ class MemoryOptimizer(object): best_mem_waste_size = sys.maxint for mid in self.idle_mem: old_mem_block = self.mem_block[mid] + if old_mem_block.mem_type != op_mem_block.mem_type: + continue new_mem_block = self.resize_mem_block( old_mem_block, op_mem_block) add_mem_size = self.sub_mem_block(new_mem_block, @@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer): for arg in op.arg: if arg.name == 'mode' and arg.i == 0: return False - elif op.type == 'Shape': - for i in range(len(op.output)): - mem_id = self.total_cpu_mem_count - self.total_cpu_mem_count += 1 - op_mem_block = self.get_op_mem_block( - op.type, - op.output_shape[i].dims) - self.mem_block[mem_id] = op_mem_block - return False return op.type != 'ImageToBuffer' def get_op_mem_block(self, op_type, output_shape): - mem_block = [0, 0] if op_type == 'WinogradTransform' or op_type == 'MatMul': - mem_block[0] = output_shape[2] - mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4) + buffer_shape = list(output_shape) + [1] + mem_block = MemoryBlock( + mace_pb2.GPU_IMAGE, + calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT, + buffer_shape)) elif op_type == 'Shape': - mem_block[0] = output_shape[0] - mem_block[1] = 1 + mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, + [output_shape[0], 1]) else: if len(output_shape) == 2: # only support fc/softmax - mem_block[0] = int((output_shape[1] + 3) / 4) - mem_block[1] = output_shape[0] + buffer_shape = [output_shape[0], 1, 1, output_shape[1]] elif len(output_shape) == 4: - mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4) - mem_block[1] = output_shape[0] * output_shape[1] + buffer_shape = output_shape else: raise Exception('output shape dim size is not 2 or 4.') + mem_block = MemoryBlock( + mace_pb2.GPU_IMAGE, + calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, + buffer_shape)) return mem_block def mem_size(self, memory_block): - 
return memory_block[0] * memory_block[1] * 4 + if memory_block.mem_type == mace_pb2.GPU_IMAGE: + return memory_block.block[0] * memory_block.block[1] * 4 + else: + return memory_block.block[0] def resize_mem_block(self, old_mem_block, op_mem_block): - resize_mem_block = [ - max(old_mem_block[0], op_mem_block[0]), - max(old_mem_block[1], op_mem_block[1]) - ] + resize_mem_block = MemoryBlock( + old_mem_block.mem_type, + [ + max(old_mem_block.block[0], op_mem_block.block[0]), + max(old_mem_block.block[1], op_mem_block.block[1]) + ]) + return resize_mem_block def add_net_mem_blocks(self): + max_image_size_x = 0 + max_image_size_y = 0 for mem in self.mem_block: arena = self.net_def.mem_arena block = arena.mem_block.add() block.mem_id = mem - block.x = self.mem_block[mem][0] - block.y = self.mem_block[mem][1] + block.mem_type = self.mem_block[mem].mem_type + block.x = self.mem_block[mem].block[0] + block.y = self.mem_block[mem].block[1] + if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE: + max_image_size_x = max(max_image_size_x, block.x) + max_image_size_y = max(max_image_size_y, block.y) + + # Update OpenCL max image size + net_ocl_max_img_size_arg = None + for arg in self.net_def.arg: + if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: + net_ocl_max_img_size_arg = arg + max_image_size_x = max(arg.ints[0], max_image_size_x) + max_image_size_y = max(arg.ints[1], max_image_size_y) + break + if net_ocl_max_img_size_arg is None: + net_ocl_max_img_size_arg = self.net_def.arg.add() + net_ocl_max_img_size_arg.name = \ + cvt.MaceKeyword.mace_opencl_max_image_size + + net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, + max_image_size_y] def mem_id_base(self): return 20000 diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 2d2ad8ec..267911b9 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) { mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block(); mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}}); + mem_block{{i}}->set_mem_type(static_cast({{net.mem_arena.mem_block[i].mem_type}})); mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}}); mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}}); diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index b3a09a4d..2280ae35 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -244,6 +244,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < input_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(in_mem_block_x); mem_blk_ptr->set_y(in_mem_block_y); res[input_names[i]] = mem_id; @@ -263,6 +264,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < output_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(out_mem_block_x); mem_blk_ptr->set_y(out_mem_block_y); res[output_names[i]] = mem_id; diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 874e221e..9f929de6 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -245,6 +245,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < input_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + 
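The optimizer now tags every MemoryBlock with a MemoryType and sizes it accordingly: a GPU_IMAGE block covers x * y pixels of four channels, while a CPU_BUFFER block is a flat element count. A small C++ mirror of that sizing rule, using the enum values added to mace.proto (the struct here is a local stand-in, not the generated protobuf class):

#include <cstdint>
#include <iostream>

enum MemoryType { CPU_BUFFER = 0, GPU_BUFFER = 1, GPU_IMAGE = 2 };  // mirrors mace.proto

struct MemBlockSketch {
  MemoryType mem_type;
  uint64_t x;  // image width in pixels, or buffer element count
  uint64_t y;  // image height in pixels (1 for buffers)
};

// GPU image blocks count x * y pixels of 4 channels; CPU buffers count x elements.
uint64_t MemSize(const MemBlockSketch &b) {
  return b.mem_type == GPU_IMAGE ? b.x * b.y * 4 : b.x;
}

int main() {
  std::cout << MemSize({GPU_IMAGE, 128, 64}) << "\n";   // 32768
  std::cout << MemSize({CPU_BUFFER, 1024, 1}) << "\n";  // 1024
  return 0;
}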
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(in_mem_block_x); mem_blk_ptr->set_y(in_mem_block_y); res[input_names[i]] = mem_id; @@ -264,6 +265,7 @@ std::map AddMemoryOptimization( for (size_t i = 0; i < output_size; ++i) { MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); mem_blk_ptr->set_mem_id(mem_id); + mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); mem_blk_ptr->set_x(out_mem_block_x); mem_blk_ptr->set_y(out_mem_block_y); res[output_names[i]] = mem_id; -- GitLab
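For reference, the IN_OUT_CHANNEL mapping that the new Python helper and the existing C++ helper are expected to agree on takes an NHWC tensor to a 2D image of (ceil(C/4) * W) x (N * H) pixels. A minimal C++ mirror of that rule follows; shape validation is reduced to the assumption that the input is 4-D.

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors roundup_div4 from convert_util.py.
inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

// IN_OUT_CHANNEL rule: {ceil(C/4) * W, N * H} for an NHWC shape.
std::vector<int64_t> InOutChannelImageShape(const std::vector<int64_t> &nhwc) {
  return {RoundUpDiv4(nhwc[3]) * nhwc[2], nhwc[0] * nhwc[1]};
}

int main() {
  const std::vector<int64_t> img = InOutChannelImageShape({1, 224, 224, 32});
  std::cout << img[0] << " x " << img[1] << "\n";  // 1792 x 224
  return 0;
}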