Commit 88120708 authored by liuqi

Add GPU availability check and return a status to the user if a GPU call fails.

Parent d9a58a5e
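With this change, GPU setup failures no longer abort the process via LOG(FATAL); the runtime records availability and kernel-level calls surface a MaceStatus. A minimal caller-side sketch (not part of this diff; it only assumes the OpenCLRuntime::Global() singleton used throughout and the is_opencl_avaliable() accessor added below):

    // Sketch: pick a device, falling back to CPU when the GPU runtime
    // failed to initialize or the device is too weak (max work-group
    // size below the kMinWorkGroupSize = 64 threshold checked below).
    DeviceType device = DeviceType::CPU;
    if (mace::OpenCLRuntime::Global()->is_opencl_avaliable()) {
      device = DeviceType::GPU;
    }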
@@ -37,7 +37,7 @@ int FileStorage::Load() {
   struct stat st;
   if (stat(file_path_.c_str(), &st) == -1) {
     if (errno == ENOENT) {
-      LOG(INFO) << "File " << file_path_
+      VLOG(1) << "File " << file_path_
               << " does not exist";
       return 0;
     } else {
...
@@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                              offset, nbytes, nullptr, nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
       nullptr, &error);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
+    mapped_ptr = nullptr;
+  }
   return mapped_ptr;
 }
@@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
   auto queue = OpenCLRuntime::Global()->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
+  }
 }

 bool OpenCLAllocator::OnHost() const { return false; }
...
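Note that Map() and MapImage() now report failure by returning nullptr instead of aborting through MACE_CHECK_CL_SUCCESS, so callers are expected to test the pointer. A hedged sketch (allocator, buf, and nbytes are placeholder names, not from this diff):

    // Sketch: propagate mapping failure as a status instead of crashing.
    void *ptr = allocator->Map(buf, 0, nbytes);
    if (ptr == nullptr) {
      return MaceStatus::MACE_OUT_OF_RESOURCES;
    }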
@@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath(
 OpenCLRuntime::OpenCLRuntime():
     precompiled_binary_storage_(nullptr),
     cache_storage_(nullptr),
-    is_profiling_enabled_(false) {
+    is_opencl_avaliable_(false),
+    is_profiling_enabled_(false),
+    opencl_version_(CL_VER_UNKNOWN),
+    gpu_type_(UNKNOWN) {
   std::vector<cl::Platform> all_platforms;
   cl::Platform::get(&all_platforms);
   if (all_platforms.size() == 0) {
-    LOG(FATAL) << "No OpenCL platforms found";
+    LOG(ERROR) << "No OpenCL platforms found";
+    return;
   }
   cl::Platform default_platform = all_platforms[0];
   std::stringstream ss;
@@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime():
   std::vector<cl::Device> all_devices;
   default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
   if (all_devices.size() == 0) {
-    LOG(FATAL) << "No OpenCL devices found";
+    LOG(ERROR) << "No OpenCL devices found";
+    return;
   }

   bool gpu_detected = false;
@@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime():
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = ParseDeviceVersion(device_version);
+      if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) {
+        return;
+      }

       VLOG(1) << "Using device: " << device_name;
       break;
     }
   }
   if (!gpu_detected) {
-    LOG(FATAL) << "No GPU device found";
+    LOG(ERROR) << "No GPU device found";
+    return;
   }

   cl_command_queue_properties properties = 0;
@@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime():
           new cl::Context({*device_}, nullptr, nullptr, nullptr, &err));
     }
   }
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   command_queue_ = std::make_shared<cl::CommandQueue>(*context_,
                                                       *device_,
                                                       properties,
                                                       &err);
-  MACE_CHECK_CL_SUCCESS(err);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return;
+  }

   extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;
@@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime():
   }

   if (cached_binary_platform_info != platform_info_) {
-    if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      LOG(WARNING) << "There is no precompiled OpenCL binary in"
-                      " all OpenCL binary paths";
-    } else {
+    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
       precompiled_binary_storage_.reset(
           new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
       if (precompiled_binary_storage_->Load() != 0) {
...
@@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime():
   } else {
     this->out_of_range_check_ = false;
   }
+
+  is_opencl_avaliable_ = true;
 }
 OpenCLRuntime::~OpenCLRuntime() {
@@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() {
   device_.reset();
 }

+bool OpenCLRuntime::is_opencl_avaliable() {
+  static const uint64_t kMinWorkGroupSize = 64;
+  return is_opencl_avaliable_
+      && GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize;
+}
+
 cl::Context &OpenCLRuntime::context() { return *context_; }

 cl::Device &OpenCLRuntime::device() { return *device_; }
@@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
   return true;
 }

-void OpenCLRuntime::BuildProgramFromSource(
+bool OpenCLRuntime::BuildProgramFromSource(
     const std::string &program_name,
     const std::string &built_program_key,
     const std::string &build_options_str,
...
@@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource(
     LOG(WARNING) << "Build program "
                  << program_name << " from source failed: "
                  << MakeString(ret);
-    return;
+    return false;
   }
   // Keep built program binary
@@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
                                   sizeof(size_t) * device_list_size,
                                   program_binary_sizes.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
         new std::unique_ptr<unsigned char[]>[device_list_size]);
     for (cl_uint i = 0; i < device_list_size; ++i) {
...
@@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
                            sizeof(unsigned char *) * device_list_size,
                            program_binaries.get(), nullptr);
-    MACE_CHECK_CL_SUCCESS(err);
+    if (err != CL_SUCCESS) {
+      LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+      return false;
+    }
     std::vector<unsigned char> content(
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
...
@@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource(
     VLOG(3) << "Program from source: " << built_program_key;
   }
+  return true;
 }
-void OpenCLRuntime::BuildProgram(const std::string &program_name,
+bool OpenCLRuntime::BuildProgram(const std::string &program_name,
                                  const std::string &built_program_key,
                                  const std::string &build_options,
                                  cl::Program *program) {
...
@@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
     ret = BuildProgramFromPrecompiledBinary(built_program_key,
                                             build_options_str, program);
     if (!ret) {
-      BuildProgramFromSource(program_name, built_program_key,
-                             build_options_str, program);
+      ret = BuildProgramFromSource(program_name, built_program_key,
+                                   build_options_str, program);
     }
   }
+  return ret;
 }
-cl::Kernel OpenCLRuntime::BuildKernel(
+MaceStatus OpenCLRuntime::BuildKernel(
     const std::string &program_name,
     const std::string &kernel_name,
-    const std::set<std::string> &build_options) {
+    const std::set<std::string> &build_options,
+    cl::Kernel *kernel) {
   std::string build_options_str;
   for (auto &option : build_options) {
     build_options_str += " " + option;
...
@@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(program_name, built_program_key, build_options_str,
-                       &program);
+    bool ret = this->BuildProgram(program_name, built_program_key,
+                                  build_options_str, &program);
+    if (!ret) {
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
     built_program_map_.emplace(built_program_key, program);
   }
-  return cl::Kernel(program, kernel_name.c_str());
+  cl_int err;
+  *kernel = cl::Kernel(program, kernel_name.c_str(), &err);
+  MACE_CL_RET_STATUS(err);
+  return MaceStatus::MACE_SUCCESS;
 }
 void OpenCLRuntime::SaveBuiltCLProgram() {
@@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
 uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
   uint64_t size = 0;
-  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
+bool OpenCLRuntime::IsImageSupport() {
+  cl_bool res;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return false;
+  }
+  return res == CL_TRUE;
+}
+
+std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
+  size_t max_height, max_width;
+  cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    return {};
+  }
+  return {max_height, max_width};
+}
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }

 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
-  kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
+  cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM,
+                                       &size);
+  if (err != CL_SUCCESS) {
+    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
+    size = 0;
+  }
   return size;
 }
@@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
   } else if (words[1] == "1.0") {
     return OpenCLVersion::CL_VER_1_0;
   } else {
-    LOG(FATAL) << "Do not support OpenCL version: " << words[1];
-    return OpenCLVersion::CL_VER_1_0;
+    LOG(ERROR) << "Do not support OpenCL version: " << words[1];
+    return OpenCLVersion::CL_VER_UNKNOWN;
   }
 }
...
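The new IsImageSupport() and GetMaxImage2DSize() accessors let callers validate an image allocation up front; note GetMaxImage2DSize() returns {max_height, max_width} (height first) and an empty vector on error. A minimal sketch (width and height are hypothetical request sizes, not names from this diff):

    // Sketch: check a requested cl::Image2D size against device limits.
    auto *runtime = mace::OpenCLRuntime::Global();
    std::vector<uint64_t> max_size = runtime->GetMaxImage2DSize();
    bool fits = runtime->IsImageSupport() && !max_size.empty()
        && height <= max_size[0] && width <= max_size[1];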
@@ -42,13 +42,23 @@ enum OpenCLVersion {
   CL_VER_1_1,
   CL_VER_1_2,
   CL_VER_2_0,
+  CL_VER_UNKNOWN,
 };

 const std::string OpenCLErrorToString(cl_int error);

-#define MACE_CHECK_CL_SUCCESS(error) \
-  MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error)
+#define MACE_CL_RET_ERROR(error)                           \
+  if (error != CL_SUCCESS) {                               \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
+    return error;                                          \
+  }
+
+#define MACE_CL_RET_STATUS(error)                          \
+  if (error != CL_SUCCESS) {                               \
+    LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
+    return MaceStatus::MACE_OUT_OF_RESOURCES;              \
+  }
 class OpenCLProfilingTimer : public Timer {
  public:
...
@@ -81,19 +91,23 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  bool is_opencl_avaliable();

   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetDeviceMaxMemAllocSize();
+  bool IsImageSupport();
+  std::vector<uint64_t> GetMaxImage2DSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   bool IsNonUniformWorkgroupsSupported() const;
   bool IsOutOfRangeCheckEnabled() const;
   bool is_profiling_enabled() const;

-  cl::Kernel BuildKernel(const std::string &program_name,
-                         const std::string &kernel_name,
-                         const std::set<std::string> &build_options);
+  MaceStatus BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options,
+                         cl::Kernel *kernel);

   void SaveBuiltCLProgram();
@@ -103,7 +117,7 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime &) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;

-  void BuildProgram(const std::string &program_file_name,
+  bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
                     cl::Program *program);
...
@@ -115,7 +129,7 @@ class OpenCLRuntime {
       const std::string &built_program_key,
       const std::string &build_options_str,
       cl::Program *program);
-  void BuildProgramFromSource(
+  bool BuildProgramFromSource(
       const std::string &program_name,
       const std::string &built_program_key,
       const std::string &build_options_str,
...
@@ -125,6 +139,7 @@ class OpenCLRuntime {
  private:
   std::unique_ptr<KVStorage> precompiled_binary_storage_;
   std::unique_ptr<KVStorage> cache_storage_;
+  bool is_opencl_avaliable_;
   bool is_profiling_enabled_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
...
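The two macros added above split error propagation by return type: MACE_CL_RET_ERROR is for code paths that return the raw cl_int (such as the tuning lambdas in the helper changes below), while MACE_CL_RET_STATUS maps any OpenCL failure to MaceStatus::MACE_OUT_OF_RESOURCES. A minimal sketch of the two contexts (RunOnce and RunKernel are illustrative names, not from this diff):

    cl_int RunOnce(cl::CommandQueue &queue, const cl::Kernel &kernel) {
      cl_int error = queue.enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(64), cl::NDRange(8));
      MACE_CL_RET_ERROR(error);  // on failure: log and return the cl_int
      return CL_SUCCESS;
    }

    MaceStatus RunKernel(cl::CommandQueue &queue, const cl::Kernel &kernel) {
      cl_int error = queue.enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(64), cl::NDRange(8));
      MACE_CL_RET_STATUS(error);  // on failure: log, return MACE_OUT_OF_RESOURCES
      return MaceStatus::MACE_SUCCESS;
    }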
@@ -204,28 +204,30 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
   // TODO(liyin): memory block should not have concept of type, but to be
   // consistent with gpu, all memory block use float/half as unit
   for (auto &mem_block : net_def.mem_arena().mem_block()) {
-    if (device_type == DeviceType::GPU) {
-      // TODO(liuqi): refactor based on PB
-      if (mem_block.mem_id() >= 20000) {
-        std::unique_ptr<BufferBase> image_buf(
-            new Image());
-        MACE_RETURN_IF_ERROR(image_buf->Allocate(
-            {mem_block.x(), mem_block.y()}, dtype));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(image_buf));
-      }
-    } else {
-      if (mem_block.mem_id() < 20000) {
-        std::unique_ptr<BufferBase> tensor_buf(
-            new Buffer(GetDeviceAllocator(device_type)));
-        MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
-            mem_block.x() * GetEnumTypeSize(dtype)
-            + MACE_EXTRA_BUFFER_PAD_SIZE));
-        preallocated_allocator_.SetBuffer(mem_block.mem_id(),
-                                          std::move(tensor_buf));
-      }
-    }
+    if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)
+          + MACE_EXTRA_BUFFER_PAD_SIZE));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
+      std::unique_ptr<BufferBase> image_buf(
+          new Image());
+      MACE_RETURN_IF_ERROR(image_buf->Allocate(
+          {mem_block.x(), mem_block.y()}, dtype));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(image_buf));
+    } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
+      std::unique_ptr<BufferBase> tensor_buf(
+          new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+      MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
+          mem_block.x() * GetEnumTypeSize(dtype)));
+      preallocated_allocator_.SetBuffer(mem_block.mem_id(),
+                                        std::move(tensor_buf));
+    }
   }

   VLOG(3) << "Preallocate buffer to tensors";
   for (auto &op : net_def.op()) {
     // TODO(liuqi): refactor based on PB
...
@@ -219,7 +219,10 @@ bool RunModel(const std::vector<std::string> &input_names,
 #endif

   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Create engine error, please check the arguments" << std::endl;
+    std::cerr << "Create engine error, please check the arguments first, "
+              << "if correct, the device may not run the model, "
+              << "please fall back to other strategy."
+              << std::endl;
     exit(1);
   }
...
@@ -79,7 +79,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -115,7 +116,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
+                                           lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -68,7 +68,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -111,7 +112,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
              output_tensor->dim(2), output_tensor->dim(3));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     }
   }

-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel(
+      "buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
...
@@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("channel_shuffle", kernel_name,
+                             built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {

 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  }
   return lws;
 }
...
@@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
     if (input0->dim(3) % 4 == 0) {
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
...
@@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
     char *kerror_code = (*kernel_error)->mutable_data<char>();
...
@@ -27,7 +27,11 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 const uint32_t lws_limit = 128;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...
@@ -45,12 +49,14 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
     const uint32_t lws_size = lws[0] * lws[1];
     lws[2] = std::min<uint32_t>(
-        (cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
+        (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
+        gws[2]);
     if (lws[2] == 0) {
       lws[2] = std::min<uint32_t>(gws[2], base);
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -26,7 +26,11 @@ namespace {
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t compute_units = std::max<uint32_t>(
         OpenCLRuntime::Global()->device_compute_units() / 2, 1);
     const uint32_t base =
...
@@ -45,6 +49,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -30,7 +30,11 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...
@@ -54,6 +58,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
          output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
       LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -24,12 +24,18 @@ namespace kernels {
 namespace {

 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
-  const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
+    lws[1] = std::min<uint32_t>(gws[1], kwg_size);
+    lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
+    const uint32_t lws_size = lws[0] * lws[1];
+    lws[2] =
+        std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
+  }
   return lws;
 }
...
@@ -147,7 +153,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("crop", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -181,7 +188,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
+                                              built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   }

   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -26,7 +26,11 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
     uint32_t base = cache_size / kBaseGPUMemCacheSize;
     lws[1] = std::min<uint32_t>(gws[1], kwg_size);
     if (lws[1] >= base) {
...
@@ -38,7 +42,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
                                    kwg_size / lws[1]);
       }
     }
-  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
+    lws[0] =
+        std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
     const uint32_t lws_size = lws[0] * lws[1];
     lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
                                 gws[2]);
...
@@ -47,6 +52,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     }
     lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                                 1);
+  }
   return lws;
 }
...
@@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
       LOG(FATAL) << "Unknown activation type: " << activation;
   }
-    *kernel =
-        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("depthwise_conv2d", kernel_name,
+                             built_options, kernel));
     *kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
   const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
   std::string tuning_key =
       Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
-  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
@@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
+                                              built_options, &kernel_));
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
...
@@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
+                                              built_options, kernel));
     if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
...
@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     (*kernel_error)->UnMap();
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
@@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
-    *kernel =
-        runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("fully_connected", kernel_name,
+                             built_options, kernel));
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
...
@@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
   std::string tuning_key =
       Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
-  TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
+                                           gws->data(), *lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     (*kernel_error)->Map(nullptr);
...
...@@ -245,6 +245,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -245,6 +245,9 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t cache_size = uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size(); OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
...@@ -254,10 +257,11 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, ...@@ -254,10 +257,11 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t lws_size = lws[1] * lws[2]; const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
1); 1);
}
return lws; return lws;
} }
-void TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
@@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 3);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 3; ++i) {
+            MACE_CHECK(params[i] != 0);
            internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
@@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
@@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
               kernel, cl::NullRange,
               cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
               cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
@@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(internal_gws[0], internal_gws[1], gws2),
                 cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
@@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
@@ -388,9 +394,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      }
    };
  }
+  return MaceStatus::MACE_SUCCESS;
 }
-void TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
@@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         std::vector<uint32_t> internal_gws(gws, gws + 2);
         if (!runtime->IsNonUniformWorkgroupsSupported()) {
           for (size_t i = 0; i < 2; ++i) {
+            MACE_CHECK(params[i] != 0);
             internal_gws[i] = RoundUp(gws[i], params[i]);
           }
         }
@@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
           }
         } else {
           timer->ClearTiming();
           error = runtime->command_queue().enqueueNDRangeKernel(
               kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]),
               cl::NDRange(params[0], params[1]), nullptr, &event);
-          MACE_CHECK_CL_SUCCESS(error);
+          MACE_CL_RET_ERROR(error);
           timer->AccumulateTiming();
           tuning_result->assign(params.begin(), params.end());
@@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(internal_gws[0], gws1),
                 cl::NDRange(params[0], params[1]), nullptr, &event);
-            MACE_CHECK_CL_SUCCESS(error);
+            MACE_CL_RET_ERROR(error);
             timer->AccumulateTiming();
           }
         }
@@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         return error;
       };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
+  cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+  MACE_CL_RET_STATUS(err);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
@@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     }
   };
  }
+  return MaceStatus::MACE_SUCCESS;
 }

 }  // namespace kernels
...
@@ -65,13 +65,13 @@ std::string DtToCLDt(const DataType dt);
 std::string DtToUpstreamCLDt(const DataType dt);

-void TuningOrRun3DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future);

-void TuningOrRun2DKernel(const cl::Kernel &kernel,
+MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
...
@@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
       kernel_error_->UnMap();
     }
   }
-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
+                                            obfuscated_kernel_name,
+                                            built_options,
+                                            &b2f_kernel));

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -74,7 +74,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -99,7 +100,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer,
     kernel_error->UnMap();
   }

-  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name, built_options);
+  cl::Kernel b2f_kernel;
+  cl_int error = runtime->BuildKernel("buffer_to_image",
+                                      obfuscated_kernel_name,
+                                      built_options, &b2f_kernel);
+  if (error != CL_SUCCESS) {
+    return false;
+  }

   uint32_t idx = 0;
   if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, kwg_size / 16};

   cl::Event event;
-  cl_int error;
   if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
@@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer,
         b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  if (error != CL_SUCCESS) {
+    return false;
+  }

   runtime->command_queue().finish();
   bool is_out_of_range = false;
...
@@ -68,7 +68,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pad", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -104,7 +105,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
                                   output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -25,7 +25,11 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[2] =
@@ -37,6 +41,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   }
   lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
                               1);
+  }
   return lws;
 }
@@ -80,7 +85,10 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -160,7 +168,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   std::string tuning_key =
       Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws.data(), lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
       *(kernel_error_->mutable_data<char>()) = 0;
       kernel_error_->UnMap();
     }
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
@@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
+  MACE_CL_RET_STATUS(error);
   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
     MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
     kernel_error_->UnMap();
   }
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
@@ -25,7 +25,11 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (lws[1] >= base) {
@@ -44,6 +48,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   }
   lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
                               1);
+  }
   return lws;
 }
@@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ =
-        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(
+        runtime->BuildKernel("resize_bilinear",
+                             kernel_name,
+                             built_options,
+                             &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -61,7 +61,10 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice",
+                                              kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -107,7 +110,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
         cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   }
-  MACE_CHECK_CL_SUCCESS(error);
+  MACE_CL_RET_STATUS(error);

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
     char *kerror_code = kernel_error_->mutable_data<char>();
...
@@ -25,9 +25,13 @@ namespace kernels {
 namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   std::vector<uint32_t> lws(4, 0);
+  if (kwg_size == 0) {
+    lws[0] = lws[1] = lws[2] = 1;
+  } else {
+    uint64_t
+        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+    uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (gws[0] < base) {
     lws[0] = gws[0];
@@ -35,9 +39,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     lws[0] = gws[0] / base;
   }
   lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  lws[2] = std::max<uint32_t>(std::min<uint32_t>(gws[2],
-                                                 kwg_size / (lws[0] * lws[1])),
-                              1);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(
+      gws[2], kwg_size / (lws[0] * lws[1])), 1);
+  }
   return lws;
 }
@@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
+                                              built_options, &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
   std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -77,8 +77,10 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
   std::string tuning_key =
       Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
              batch_tensor->dim(2), batch_tensor->dim(3));
-  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     if (runtime->IsNonUniformWorkgroupsSupported()) {
       built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
              output_tensor->dim(0),
              output_tensor->dim(1),
              output_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
@@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
+    MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
+                                              obfuscated_kernel_name,
+                                              built_options,
+                                              &kernel_));

     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
@@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
              output_tensor->dim(3), input_tensor->dim(2));
-  TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+                                           gws, lws, future));

   if (runtime->IsOutOfRangeCheckEnabled()) {
     kernel_error_->Map(nullptr);
...
@@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data,
   MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
              strerror(errno));
 }

+#ifdef MACE_ENABLE_OPENCL
+MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+  // Check OpenCL avaliable
+  auto runtime = OpenCLRuntime::Global();
+  if (!runtime->is_opencl_avaliable()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  // Check whether model max OpenCL image sizes exceed OpenCL limitation.
+  if (net_def == nullptr) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  if (!runtime->IsImageSupport()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  auto opencl_max_image_size = runtime->GetMaxImage2DSize();
+  if (opencl_max_image_size.empty()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+
+  const std::vector<int64_t> net_max_image_size =
+      ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
+          *net_def, "opencl_max_image_size", {0, 0});
+  if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
+      || static_cast<uint64_t>(net_max_image_size[1])
+          > opencl_max_image_size[1]) {
+    LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
+              << " vs " << MakeString(net_max_image_size);
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
+  return MaceStatus::MACE_SUCCESS;
+}
+#endif
+
 }  // namespace

 // Mace Tensor
@@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init(
     const std::vector<std::string> &output_nodes,
     const unsigned char *model_data) {
   LOG(INFO) << "Initializing MaceEngine";
+  // Check avalibility
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type_ == DeviceType::GPU) {
+    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
+  }
+#endif
   // Get input and output information.
   for (auto &input_info : net_def->input_info()) {
     input_info_map_[input_info.name()] = input_info;
...
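With CheckGPUAvalibility wired into MaceEngine::Impl::Init, GPU problems (no OpenCL platform, missing image support, model images larger than the device limit) now reach the caller as a MaceStatus instead of a LOG(FATAL) abort. A minimal caller-side sketch follows; the engine/Init signatures match the public headers of this revision, while the CPU fallback policy is purely illustrative and not part of this commit:

    // Sketch: try the GPU first, retry on CPU if the GPU path is unusable.
    std::unique_ptr<mace::MaceEngine> engine(
        new mace::MaceEngine(mace::DeviceType::GPU));
    mace::MaceStatus status =
        engine->Init(net_def, input_nodes, output_nodes, model_data);
    if (status != mace::MaceStatus::MACE_SUCCESS) {
      LOG(WARNING) << "GPU engine init failed, retrying on CPU";
      engine.reset(new mace::MaceEngine(mace::DeviceType::CPU));
      status = engine->Init(net_def, input_nodes, output_nodes, model_data);
    }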
@@ -20,6 +20,12 @@ enum DataType {
   DT_INT32 = 4;
 }

+enum MemoryType {
+  CPU_BUFFER = 0;
+  GPU_BUFFER = 1;
+  GPU_IMAGE = 2;
+}
+
 message ConstTensor {
   repeated int64 dims = 1;
   optional DataType data_type = 2 [default = DT_FLOAT];
@@ -73,8 +79,9 @@ message OperatorDef {
 // for memory optimization
 message MemoryBlock {
   optional int32 mem_id = 1;
-  optional uint32 x = 2;
-  optional uint32 y = 3;
+  optional MemoryType mem_type = 2;
+  optional uint32 x = 3;
+  optional uint32 y = 4;
 }
 message MemoryArena {
   repeated MemoryBlock mem_block = 1;
...
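The new mem_type field lets the runtime tell image-backed arena blocks apart from flat buffers when it preallocates workspace memory. A sketch of how a consumer might dispatch on it is below; the AllocateImage/AllocateBuffer helpers are hypothetical names for illustration, not functions from this commit:

    // Sketch: dispatch arena preallocation on the new mem_type field.
    for (const mace::MemoryBlock &block : net_def.mem_arena().mem_block()) {
      if (block.mem_type() == mace::MemoryType::GPU_IMAGE) {
        // x and y are the 2D image extents computed by the converter.
        AllocateImage(block.mem_id(), block.x(), block.y());  // hypothetical
      } else {
        // CPU_BUFFER / GPU_BUFFER: x is the flat element count, y is 1.
        AllocateBuffer(block.mem_id(), block.x());            // hypothetical
      }
    }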
@@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory
   std::unique_ptr<Impl> impl_;
 };

-// Set Key-Value store factory. (Call Once)
-// Now KVStorage is used to store the built OpenCL binaries to file,
-// which could speed up the GPU initialization and first run.
-// If do not call this API, the initialization maybe slow for GPU.
+/// \brief Set internal storage factory to store internal data. (Call once)
+///
+/// Now the path is used to store the built OpenCL binaries to file,
+/// which could speed up the GPU initialization and first run.
+/// If do not call this API, the initialization maybe slow for GPU.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);

-// Just call once. (Not thread-safe)
-// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)
-// if you use gpu of specific soc.
-// Using OpenCL binary will speed up the initialization.
-// OpenCL binary is corresponding to the OpenCL Driver version,
-// you should update the binary when OpenCL Driver changed.
+/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
+///
+/// Just call once. (Not thread-safe)
+/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
+/// OpenCL binary is corresponding to the OpenCL Driver version,
+/// you should update the binary when OpenCL Driver changed.
+///
+/// \param paths MACE will use first file found in all paths
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);

-// Just call once. (Not thread-safe)
-// Set the path of Generated OpenCL parameter file
-// if you use gpu for specific soc.
-// The parameters is the local work group size tuned for specific SOC, which
-// may be faster than the general parameters.
+/// \brief Set the path of Generated OpenCL parameter file
+///
+/// Just call once. (Not thread-safe)
+/// If you use gpu for specific soc, The parameters is the local work group
+/// size tuned for specific SOC, which may be faster than the
+/// general parameters.
+///
+/// \param path Make sure your program have Read/Write permission of the path
+/// \return
 __attribute__((visibility("default")))
 void SetOpenCLParameterPath(const std::string &path);

-// Set GPU hints, currently only supports Adreno GPU.
-//
-// Caution: this function may hurt performance if improper parameters provided.
+/// \brief Set GPU hints, currently only supports Adreno GPU.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+///
+/// \param perf_hint performance hint
+/// \param priority_hint priority hint
+/// \return
 __attribute__((visibility("default")))
 void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);

-// Set OpenMP threads number and affinity policy.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// num_threads_hint is only a hint. When num_threads_hint is zero or negative,
-// the function will set the threads number equaling to the number of
-// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-// (AFFINITY_NONE) cores according to the policy. The threads number will
-// also be truncated to the corresponding cores number when num_threads_hint
-// is larger than it.
-//
-// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
-// suggested to use AFFINITY_NONE to use all cores.
+/// \brief Set OpenMP threads number and affinity policy.
+///
+/// Caution: this function may hurt performance if improper parameters provided.
+/// When num_threads_hint is zero or negative,
+/// the function will set the threads number equaling to the number of
+/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+/// (AFFINITY_NONE) cores according to the policy. The threads number will
+/// also be truncated to the corresponding cores number when num_threads_hint
+/// is larger than it.
+/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
+/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
+///
+/// \param num_threads_hint it is only a hint.
+/// \param policy one of CPUAffinityPolicy
+/// \param status MACE_SUCCESS for successful, or it can't reliabley
+/// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+/// suggested to use AFFINITY_NONE to use all cores.
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
                                  CPUAffinityPolicy policy);

-// Set OpenMP threads number and processor affinity.
-//
-// Caution: this function may hurt performance if improper parameters provided.
-//
-// This function may not work well on some chips (e.g. MTK). Setting thread
-// affinity to offline cores may run very slow or unexpectedly. In such cases,
-// please use SetOpenMPThreadPolicy with default policy instead.
+/// \brief Set OpenMP threads number and processor affinity.
+///
+/// Caution: this function may hurt performance
+/// if improper parameters provided.
+/// This function may not work well on some chips (e.g. MTK). Setting thread
+/// affinity to offline cores may run very slow or unexpectedly.
+/// In such cases, please use SetOpenMPThreadPolicy with default policy
+/// instead.
+///
+/// \param num_threads
+/// \param cpu_ids
+/// \param status
+/// \return
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadAffinity(int num_threads,
                                    const std::vector<int> &cpu_ids);

-// Get ARM big.LITTLE configuration.
-//
-// This function will detect the max frequencies of all CPU cores, and assume
-// the cores with largest max frequencies as big cores, and all the remaining
-// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-// little_core_ids will both be filled with all cpu core ids.
-//
-// If successful, it returns MACE_SUCCESS and error if it can't reliabley
-// detect the frequency of big-LITTLE cores (e.g. MTK).
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function will detect the max frequencies of all CPU cores, and assume
+/// the cores with largest max frequencies as big cores, and all the remaining
+/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
+/// little_core_ids will both be filled with all cpu core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return If successful, it returns MACE_SUCCESS and error if it can't
+///         reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
 __attribute__((visibility("default")))
 MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                std::vector<int> *little_core_ids);
...
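Taken together, these newly documented setup APIs are meant to be called once, before the engine is created. A sketch of a typical call sequence follows; the paths are example values, and the assumption that FileStorageFactory takes a storage directory path matches its declaration in this header:

    // Sketch: one-time process setup before creating a GPU MaceEngine.
    std::shared_ptr<mace::KVStorageFactory> storage_factory(
        new mace::FileStorageFactory("/data/local/tmp/mace_storage"));  // example path
    mace::SetKVStorageFactory(storage_factory);
    mace::SetOpenCLBinaryPaths({"/data/local/tmp/mace_cl_compiled.bin"});  // example path
    mace::SetOpenCLParameterPath("/data/local/tmp/mace_run.config");       // example path
    mace::SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
                      mace::GPUPriorityHint::PRIORITY_LOW);
    mace::MaceStatus status = mace::SetOpenMPThreadPolicy(
        4, mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY);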
@@ -12,7 +12,72 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import enum
+
 def mace_check(condition, msg):
     if not condition:
         raise Exception(msg)
+
+
+def roundup_div4(value):
+    return int((value + 3) / 4)
+
+
+class OpenCLBufferType(enum.Enum):
+    CONV2D_FILTER = 0
+    IN_OUT_CHANNEL = 1
+    ARGUMENT = 2
+    IN_OUT_HEIGHT = 3
+    IN_OUT_WIDTH = 4
+    WINOGRAD_FILTER = 5
+    DW_CONV2D_FILTER = 6
+    WEIGHT_HEIGHT = 7
+    WEIGHT_WIDTH = 8
+
+
+def calculate_image_shape(buffer_type, shape, winograd_blk_size=0):
+    # keep the same with mace/kernel/opencl/helper.cc
+    image_shape = [0, 0]
+    if buffer_type == OpenCLBufferType.CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D")
+        image_shape[0] = shape[1]
+        image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL:
+        mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[3]) * shape[2]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.ARGUMENT:
+        mace_check(len(shape) == 1,
+                   "Argument buffer should be 1D not " + str(shape))
+        image_shape[0] = roundup_div4(shape[0])
+        image_shape[1] = 1
+    elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = shape[2] * shape[3]
+        image_shape[1] = shape[0] * roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH:
+        mace_check(len(shape) == 4, "Input/output buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[2]) * shape[3]
+        image_shape[1] = shape[0] * shape[1]
+    elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1])
+        image_shape[1] = (shape[0] * (winograd_blk_size + 2)
+                          * (winograd_blk_size + 2))
+    elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER:
+        mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
+        image_shape[0] = shape[0] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[1])
+    elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = shape[1] * shape[2] * shape[3]
+        image_shape[1] = roundup_div4(shape[0])
+    elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH:
+        mace_check(len(shape) == 4, "Weight buffer should be 4D")
+        image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3]
+        image_shape[1] = shape[0]
+    else:
+        mace_check(False, "OpenCL Image do not support type "
+                   + str(buffer_type))
+    return image_shape
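As a worked example of the IN_OUT_CHANNEL rule above: an NHWC tensor of shape [1, 224, 224, 32] maps to an image of [roundup_div4(32) * 224, 1 * 224] = [8 * 224, 224] = [1792, 224]. It is the per-dimension maxima over all such per-tensor image shapes that end up in the opencl_max_image_size argument checked by CheckGPUAvalibility at init time.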
@@ -171,6 +171,13 @@ def main(unused_args):
             output_graph_def.op.extend(cpu_graph_def.op)
             output_graph_def.mem_arena.mem_block.extend(
                 cpu_graph_def.mem_arena.mem_block)
+            output_graph_arg_names = set()
+            for arg in output_graph_def.arg:
+                output_graph_arg_names.add(arg.name)
+
+            for arg in cpu_graph_def.arg:
+                if arg.name not in output_graph_arg_names:
+                    output_graph_def.arg.extend(arg)
             print "Merge done"
         else:
             option.device = device_type_map[FLAGS.runtime]
...
@@ -163,6 +163,7 @@ class MaceKeyword(object):
     mace_op_data_type_str = 'T'
     mace_offset_str = 'offset'
     mace_from_caffe_str = 'from_caffe'
+    mace_opencl_max_image_size = "opencl_max_image_size"

 class TransformerRule(Enum):
...
@@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
 from mace.python.tools.converter_tool.base_converter import MaceOp
 from mace.python.tools.converter_tool.base_converter import PaddingMode
 from mace.python.tools.converter_tool.base_converter import TransformerRule
+from mace.python.tools.convert_util import calculate_image_shape
 from mace.python.tools.convert_util import mace_check
+from mace.python.tools.convert_util import OpenCLBufferType

 OPENCL_IMAGE_MAX_SIZE = 16384

-class OpenCLBufferType(enum.Enum):
-    CONV2D_FILTER = 0
-    IN_OUT_CHANNEL = 1
-    ARGUMENT = 2
-    IN_OUT_HEIGHT = 3
-    IN_OUT_WIDTH = 4
-    WINOGRAD_FILTER = 5
-    DW_CONV2D_FILTER = 6
-    WEIGHT_HEIGHT = 7
-    WEIGHT_WIDTH = 8

 class Transformer(base_converter.ConverterInterface):
@@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface):
         self._producer = {}
         self._target_data_format = DataFormat.NHWC
         self._input_output_added = False
+        self._opencl_max_image_size = [0, 0]

         if self._option.device == DeviceType.CPU.value:
             self._target_data_format = DataFormat.NCHW
@@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface):
             arg.name = MaceKeyword.mace_mode
             arg.i = 0

+        tensor_shape = list(self._consts[input_name].dims)
         if input_type == OpenCLBufferType.WINOGRAD_FILTER:
             blk_sqr = op.output_shape[0].dims[0]
             wino_blk = int(np.sqrt(blk_sqr)) - 2
             wino_arg = op_def.arg.add()
             wino_arg.name = MaceKeyword.mace_wino_block_size
             wino_arg.i = wino_blk
+            img_shape = calculate_image_shape(input_type, tensor_shape,
+                                              wino_blk)
+        else:
+            img_shape = calculate_image_shape(input_type, tensor_shape)

         op.input[input_idx] = output_name
+        # update OpenCL max image size
+        self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0],
+                                             img_shape[0])
+        self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
+                                             img_shape[1])

     def transform_buffer_image(self):
         if self._option.device != DeviceType.GPU.value:
             return False
@@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface):
                     MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name:  # noqa
                 self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)

+        # Add OpenCL max image size
+        arg = net.arg.add()
+        arg.name = MaceKeyword.mace_opencl_max_image_size
+        arg.ints.extend(self._opencl_max_image_size)
+
         for input_node in self._option.input_nodes.values():
             new_input_name = MaceKeyword.mace_input_node_name \
                              + '_' + input_node.name
...
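Note that the tracking is a per-dimension max rather than a max over whole images: if one converted tensor maps to a [1792, 224] image and another to a [512, 896] image, the recorded opencl_max_image_size becomes [1792, 896].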
@@ -16,6 +16,24 @@ import sys
 import operator

 from mace.proto import mace_pb2
+from mace.python.tools.converter_tool import base_converter as cvt
+from mace.python.tools.convert_util import calculate_image_shape
+from mace.python.tools.convert_util import OpenCLBufferType
+
+
+class MemoryBlock(object):
+    def __init__(self, mem_type, block):
+        self._mem_type = mem_type
+        self._block = block
+
+    @property
+    def mem_type(self):
+        return self._mem_type
+
+    @property
+    def block(self):
+        return self._block

 class MemoryOptimizer(object):
     def __init__(self, net_def):
@@ -24,7 +42,6 @@ class MemoryOptimizer(object):
         self.op_mem = {}  # op_name->mem_id
         self.mem_block = {}  # mem_id->[size] or mem_id->[x, y]
         self.total_mem_count = 0
-        self.total_cpu_mem_count = 0
         self.input_ref_counter = {}
         self.mem_ref_counter = {}
@@ -52,23 +69,27 @@ class MemoryOptimizer(object):
         return True

     def get_op_mem_block(self, op_type, output_shape):
-        return [reduce(operator.mul, output_shape, 1)]
+        return MemoryBlock(mace_pb2.CPU_BUFFER,
+                           [reduce(operator.mul, output_shape, 1)])

     def mem_size(self, memory_block):
-        return memory_block[0]
+        return memory_block.block[0]

     def sub_mem_block(self, mem_block1, mem_block2):
         return self.mem_size(mem_block1) - self.mem_size(mem_block2)

     def resize_mem_block(self, old_mem_block, op_mem_block):
-        return [max(old_mem_block[0], op_mem_block[0])]
+        return MemoryBlock(
+            old_mem_block.mem_type,
+            [max(old_mem_block.block[0], op_mem_block.block[0])])

     def add_net_mem_blocks(self):
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
             block.y = 1

     def get_total_origin_mem_size(self):
@@ -82,7 +103,7 @@ class MemoryOptimizer(object):
     def get_total_optimized_mem_size(self):
         optimized_mem_size = 0
         for mem in self.mem_block:
-            print mem, self.mem_block[mem]
+            print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
             optimized_mem_size += self.mem_size(self.mem_block[mem])
         return optimized_mem_size
@@ -117,6 +138,8 @@ class MemoryOptimizer(object):
                 best_mem_waste_size = sys.maxint
                 for mid in self.idle_mem:
                     old_mem_block = self.mem_block[mid]
+                    if old_mem_block.mem_type != op_mem_block.mem_type:
+                        continue
                     new_mem_block = self.resize_mem_block(
                         old_mem_block, op_mem_block)
                     add_mem_size = self.sub_mem_block(new_mem_block,
@@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer):
             for arg in op.arg:
                 if arg.name == 'mode' and arg.i == 0:
                     return False
-        elif op.type == 'Shape':
-            for i in range(len(op.output)):
-                mem_id = self.total_cpu_mem_count
-                self.total_cpu_mem_count += 1
-                op_mem_block = self.get_op_mem_block(
-                    op.type,
-                    op.output_shape[i].dims)
-                self.mem_block[mem_id] = op_mem_block
-            return False
         return op.type != 'ImageToBuffer'

     def get_op_mem_block(self, op_type, output_shape):
-        mem_block = [0, 0]
         if op_type == 'WinogradTransform' or op_type == 'MatMul':
-            mem_block[0] = output_shape[2]
-            mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
+            buffer_shape = list(output_shape) + [1]
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
+                                      buffer_shape))
        elif op_type == 'Shape':
-            mem_block[0] = output_shape[0]
-            mem_block[1] = 1
+            mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
+                                    [output_shape[0], 1])
         else:
             if len(output_shape) == 2:  # only support fc/softmax
-                mem_block[0] = int((output_shape[1] + 3) / 4)
-                mem_block[1] = output_shape[0]
+                buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
             elif len(output_shape) == 4:
-                mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
-                mem_block[1] = output_shape[0] * output_shape[1]
+                buffer_shape = output_shape
             else:
                 raise Exception('output shape dim size is not 2 or 4.')
+            mem_block = MemoryBlock(
+                mace_pb2.GPU_IMAGE,
+                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
+                                      buffer_shape))
         return mem_block

     def mem_size(self, memory_block):
-        return memory_block[0] * memory_block[1] * 4
+        if memory_block.mem_type == mace_pb2.GPU_IMAGE:
+            return memory_block.block[0] * memory_block.block[1] * 4
+        else:
+            return memory_block.block[0]

     def resize_mem_block(self, old_mem_block, op_mem_block):
-        resize_mem_block = [
-            max(old_mem_block[0], op_mem_block[0]),
-            max(old_mem_block[1], op_mem_block[1])
-        ]
+        resize_mem_block = MemoryBlock(
+            old_mem_block.mem_type,
+            [
+                max(old_mem_block.block[0], op_mem_block.block[0]),
+                max(old_mem_block.block[1], op_mem_block.block[1])
+            ])
         return resize_mem_block

     def add_net_mem_blocks(self):
+        max_image_size_x = 0
+        max_image_size_y = 0
         for mem in self.mem_block:
             arena = self.net_def.mem_arena
             block = arena.mem_block.add()
             block.mem_id = mem
-            block.x = self.mem_block[mem][0]
-            block.y = self.mem_block[mem][1]
+            block.mem_type = self.mem_block[mem].mem_type
+            block.x = self.mem_block[mem].block[0]
+            block.y = self.mem_block[mem].block[1]
+            if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
+                max_image_size_x = max(max_image_size_x, block.x)
+                max_image_size_y = max(max_image_size_y, block.y)
+
+        # Update OpenCL max image size
+        net_ocl_max_img_size_arg = None
+        for arg in self.net_def.arg:
+            if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
+                net_ocl_max_img_size_arg = arg
+                max_image_size_x = max(arg.ints[0], max_image_size_x)
+                max_image_size_y = max(arg.ints[1], max_image_size_y)
+                break
+        if net_ocl_max_img_size_arg is None:
+            net_ocl_max_img_size_arg = self.net_def.arg.add()
+            net_ocl_max_img_size_arg.name = \
+                cvt.MaceKeyword.mace_opencl_max_image_size
+        net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
+                                            max_image_size_y]

     def mem_id_base(self):
         return 20000
...
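Two consequences of the MemoryBlock wrapper are visible above: blocks of different mem_type are never merged (the new continue in the best-fit loop), and the cost model now differs per type. For example, a GPU_IMAGE block of [1792, 224] is scored as 1792 * 224 * 4 values (the factor of 4 presumably accounting for the four channels packed per image pixel), while a CPU_BUFFER block is scored by its flat element count.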
@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
     mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
     mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
+    mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
     mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
     mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
...
@@ -244,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
@@ -263,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...
@@ -245,6 +245,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < input_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(in_mem_block_x);
     mem_blk_ptr->set_y(in_mem_block_y);
     res[input_names[i]] = mem_id;
@@ -264,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
   for (size_t i = 0; i < output_size; ++i) {
     MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
     mem_blk_ptr->set_mem_id(mem_id);
+    mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
     mem_blk_ptr->set_x(out_mem_block_x);
     mem_blk_ptr->set_y(out_mem_block_y);
     res[output_names[i]] = mem_id;
...