Commit 88120708 authored by liuqi

Add GPU availability check and return status to the user if a GPU call fails.

Parent d9a58a5e
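
The caller-visible effect, as a minimal sketch (illustrative only, not part of this diff; it just composes the APIs changed below, and kernel_name/built_options stand in for real arguments):

    // Probe GPU availability up front instead of relying on LOG(FATAL)
    // inside the runtime, then propagate MaceStatus from kernel builds.
    auto runtime = OpenCLRuntime::Global();
    if (!runtime->is_opencl_avaliable()) {
      // e.g. fall back to the CPU path
      return MaceStatus::MACE_OUT_OF_RESOURCES;
    }
    cl::Kernel kernel;
    MACE_RETURN_IF_ERROR(
        runtime->BuildKernel("softmax", kernel_name, built_options, &kernel));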
......@@ -37,8 +37,8 @@ int FileStorage::Load() {
struct stat st;
if (stat(file_path_.c_str(), &st) == -1) {
if (errno == ENOENT) {
LOG(INFO) << "File " << file_path_
<< " does not exist";
VLOG(1) << "File " << file_path_
<< " does not exist";
return 0;
} else {
LOG(WARNING) << "Stat file " << file_path_
......
......@@ -123,7 +123,10 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
offset, nbytes, nullptr, nullptr, &error);
MACE_CHECK_CL_SUCCESS(error);
if (error != CL_SUCCESS) {
LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
mapped_ptr = nullptr;
}
return mapped_ptr;
}
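// Note: with MACE_CHECK_CL_SUCCESS gone, Map()/MapImage() signal failure by
// returning nullptr, so callers are expected to check the mapped pointer
// instead of relying on a hard abort.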
......@@ -142,8 +145,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
nullptr, &error);
MACE_CHECK_CL_SUCCESS(error);
if (error != CL_SUCCESS) {
LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
mapped_ptr = nullptr;
}
return mapped_ptr;
}
......@@ -152,7 +157,9 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
auto queue = OpenCLRuntime::Global()->command_queue();
cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
nullptr, nullptr);
MACE_CHECK_CL_SUCCESS(error);
if (error != CL_SUCCESS) {
LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
}
}
bool OpenCLAllocator::OnHost() const { return false; }
......
......@@ -307,11 +307,15 @@ void OpenCLRuntime::ConfigureOpenCLBinaryPath(
OpenCLRuntime::OpenCLRuntime():
precompiled_binary_storage_(nullptr),
cache_storage_(nullptr),
is_profiling_enabled_(false) {
is_opencl_avaliable_(false),
is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN) {
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(FATAL) << "No OpenCL platforms found";
LOG(ERROR) << "No OpenCL platforms found";
return;
}
cl::Platform default_platform = all_platforms[0];
std::stringstream ss;
......@@ -325,7 +329,8 @@ OpenCLRuntime::OpenCLRuntime():
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(FATAL) << "No OpenCL devices found";
LOG(ERROR) << "No OpenCL devices found";
return;
}
bool gpu_detected = false;
......@@ -340,13 +345,17 @@ OpenCLRuntime::OpenCLRuntime():
const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
opencl_version_ = ParseDeviceVersion(device_version);
if (opencl_version_ == OpenCLVersion::CL_VER_UNKNOWN) {
return;
}
VLOG(1) << "Using device: " << device_name;
break;
}
}
if (!gpu_detected) {
LOG(FATAL) << "No GPU device found";
LOG(ERROR) << "No GPU device found";
return;
}
cl_command_queue_properties properties = 0;
......@@ -384,13 +393,19 @@ OpenCLRuntime::OpenCLRuntime():
new cl::Context({*device_}, nullptr, nullptr, nullptr, &err));
}
}
MACE_CHECK_CL_SUCCESS(err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return;
}
command_queue_ = std::make_shared<cl::CommandQueue>(*context_,
*device_,
properties,
&err);
MACE_CHECK_CL_SUCCESS(err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return;
}
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
std::string cached_binary_platform_info;
......@@ -416,10 +431,7 @@ OpenCLRuntime::OpenCLRuntime():
}
if (cached_binary_platform_info != platform_info_) {
if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
LOG(WARNING) << "There is no precompiled OpenCL binary in"
" all OpenCL binary paths";
} else {
if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
precompiled_binary_storage_.reset(
new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
if (precompiled_binary_storage_->Load() != 0) {
......@@ -450,6 +462,8 @@ OpenCLRuntime::OpenCLRuntime():
} else {
this->out_of_range_check_ = false;
}
is_opencl_avaliable_ = true;
}
OpenCLRuntime::~OpenCLRuntime() {
......@@ -460,6 +474,12 @@ OpenCLRuntime::~OpenCLRuntime() {
device_.reset();
}
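// Set only when the constructor ran to completion (platform, device, context
// and command queue all created); combined below with a minimum work-group
// size as a basic usability check for the device.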
bool OpenCLRuntime::is_opencl_avaliable() {
static const uint64_t kMinWorkGroupSize = 64;
return is_opencl_avaliable_
&& GetDeviceMaxWorkGroupSize() >= kMinWorkGroupSize;
}
cl::Context &OpenCLRuntime::context() { return *context_; }
cl::Device &OpenCLRuntime::device() { return *device_; }
......@@ -538,7 +558,7 @@ bool OpenCLRuntime::BuildProgramFromPrecompiledBinary(
return true;
}
void OpenCLRuntime::BuildProgramFromSource(
bool OpenCLRuntime::BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
......@@ -562,7 +582,7 @@ void OpenCLRuntime::BuildProgramFromSource(
LOG(WARNING) << "Build program "
<< program_name << " from source failed: "
<< MakeString(ret);
return;
return false;
}
// Keep built program binary
......@@ -572,7 +592,10 @@ void OpenCLRuntime::BuildProgramFromSource(
cl_int err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * device_list_size,
program_binary_sizes.get(), nullptr);
MACE_CHECK_CL_SUCCESS(err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return false;
}
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) {
......@@ -583,7 +606,10 @@ void OpenCLRuntime::BuildProgramFromSource(
err = clGetProgramInfo((*program)(), CL_PROGRAM_BINARIES,
sizeof(unsigned char *) * device_list_size,
program_binaries.get(), nullptr);
MACE_CHECK_CL_SUCCESS(err);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return false;
}
std::vector<unsigned char> content(
reinterpret_cast<unsigned char const *>(program_binaries[0].get()),
reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
......@@ -600,9 +626,10 @@ void OpenCLRuntime::BuildProgramFromSource(
VLOG(3) << "Program from source: " << built_program_key;
}
return true;
}
void OpenCLRuntime::BuildProgram(const std::string &program_name,
bool OpenCLRuntime::BuildProgram(const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options,
cl::Program *program) {
......@@ -617,16 +644,18 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
ret = BuildProgramFromPrecompiledBinary(built_program_key,
build_options_str, program);
if (!ret) {
BuildProgramFromSource(program_name, built_program_key,
build_options_str, program);
ret = BuildProgramFromSource(program_name, built_program_key,
build_options_str, program);
}
}
return ret;
}
cl::Kernel OpenCLRuntime::BuildKernel(
MaceStatus OpenCLRuntime::BuildKernel(
const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
const std::set<std::string> &build_options,
cl::Kernel *kernel) {
std::string build_options_str;
for (auto &option : build_options) {
build_options_str += " " + option;
......@@ -639,11 +668,17 @@ cl::Kernel OpenCLRuntime::BuildKernel(
if (built_program_it != built_program_map_.end()) {
program = built_program_it->second;
} else {
this->BuildProgram(program_name, built_program_key, build_options_str,
&program);
bool ret = this->BuildProgram(program_name, built_program_key,
build_options_str, &program);
if (!ret) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
built_program_map_.emplace(built_program_key, program);
}
return cl::Kernel(program, kernel_name.c_str());
cl_int err;
*kernel = cl::Kernel(program, kernel_name.c_str(), &err);
MACE_CL_RET_STATUS(err);
return MaceStatus::MACE_SUCCESS;
}
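// Illustrative call site (this is the pattern the kernel functors below
// adopt): the kernel is now returned through an out-parameter and failures
// surface as MaceStatus rather than a CHECK-failure:
//   cl::Kernel kernel;
//   MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling", kernel_name,
//                                             built_options, &kernel));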
void OpenCLRuntime::SaveBuiltCLProgram() {
......@@ -667,25 +702,67 @@ void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
uint64_t size = 0;
device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
cl_int err = device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
size = 0;
}
return size;
}
uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
uint64_t size = 0;
device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
cl_int err = device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
size = 0;
}
return size;
}
bool OpenCLRuntime::IsImageSupport() {
cl_bool res;
cl_int err = device_->getInfo(CL_DEVICE_IMAGE_SUPPORT, &res);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return false;
}
return res == CL_TRUE;
}
std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
size_t max_height, max_width;
cl_int err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_HEIGHT, &max_height);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return {};
}
err = device_->getInfo(CL_DEVICE_IMAGE2D_MAX_WIDTH, &max_width);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return {};
}
return {max_height, max_width};
}
uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
uint64_t size = 0;
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE,
&size);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
size = 0;
}
return size;
}
uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
uint64_t size = 0;
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
cl_int err = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM,
&size);
if (err != CL_SUCCESS) {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
size = 0;
}
return size;
}
......@@ -717,8 +794,8 @@ OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
} else if (words[1] == "1.0") {
return OpenCLVersion::CL_VER_1_0;
} else {
LOG(FATAL) << "Do not support OpenCL version: " << words[1];
return OpenCLVersion::CL_VER_1_0;
LOG(ERROR) << "Do not support OpenCL version: " << words[1];
return OpenCLVersion::CL_VER_UNKNOWN;
}
}
......
......@@ -42,13 +42,23 @@ enum OpenCLVersion {
CL_VER_1_1,
CL_VER_1_2,
CL_VER_2_0,
CL_VER_UNKNOWN,
};
const std::string OpenCLErrorToString(cl_int error);
#define MACE_CHECK_CL_SUCCESS(error) \
MACE_CHECK(error == CL_SUCCESS) << "error: " << OpenCLErrorToString(error)
#define MACE_CL_RET_ERROR(error) \
if (error != CL_SUCCESS) { \
LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
return error; \
}
#define MACE_CL_RET_STATUS(error) \
if (error != CL_SUCCESS) { \
LOG(ERROR) << "error: " << OpenCLErrorToString(error); \
return MaceStatus::MACE_OUT_OF_RESOURCES; \
}
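// The two macros differ only in what they propagate: MACE_CL_RET_ERROR
// returns the raw cl_int (suited to the cl_int-returning tuning lambdas
// below), while MACE_CL_RET_STATUS maps any OpenCL failure to
// MaceStatus::MACE_OUT_OF_RESOURCES for MaceStatus-returning functions.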
class OpenCLProfilingTimer : public Timer {
public:
......@@ -81,19 +91,23 @@ class OpenCLRuntime {
const std::string platform_info() const;
uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const;
bool is_opencl_avaliable();
void GetCallStats(const cl::Event &event, CallStats *stats);
uint64_t GetDeviceMaxWorkGroupSize();
uint64_t GetDeviceMaxMemAllocSize();
bool IsImageSupport();
std::vector<uint64_t> GetMaxImage2DSize();
uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
bool IsNonUniformWorkgroupsSupported() const;
bool IsOutOfRangeCheckEnabled() const;
bool is_profiling_enabled() const;
cl::Kernel BuildKernel(const std::string &program_name,
MaceStatus BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
const std::set<std::string> &build_options,
cl::Kernel *kernel);
void SaveBuiltCLProgram();
......@@ -103,7 +117,7 @@ class OpenCLRuntime {
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
void BuildProgram(const std::string &program_file_name,
bool BuildProgram(const std::string &program_file_name,
const std::string &binary_file_name,
const std::string &build_options,
cl::Program *program);
......@@ -115,7 +129,7 @@ class OpenCLRuntime {
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
void BuildProgramFromSource(
bool BuildProgramFromSource(
const std::string &program_name,
const std::string &built_program_key,
const std::string &build_options_str,
......@@ -125,6 +139,7 @@ class OpenCLRuntime {
private:
std::unique_ptr<KVStorage> precompiled_binary_storage_;
std::unique_ptr<KVStorage> cache_storage_;
bool is_opencl_avaliable_;
bool is_profiling_enabled_;
// All OpenCL objects must be pointers and manually deleted before unloading
// the OpenCL library.
......
......@@ -204,26 +204,28 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// TODO(liyin): memory blocks should not have a concept of type, but to be
// consistent with the GPU, all memory blocks use float/half as the unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (device_type == DeviceType::GPU) {
// TODO(liuqi): refactor based on PB
if (mem_block.mem_id() >= 20000) {
std::unique_ptr<BufferBase> image_buf(
new Image());
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
}
} else {
if (mem_block.mem_id() < 20000) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(device_type)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::CPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image());
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::GPU)));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
VLOG(3) << "Preallocate buffer to tensors";
......
......@@ -219,7 +219,10 @@ bool RunModel(const std::vector<std::string> &input_names,
#endif
if (create_engine_status != MaceStatus::MACE_SUCCESS) {
std::cerr << "Create engine error, please check the arguments" << std::endl;
std::cerr << "Create engine error, please check the arguments first, "
<< "if correct, the device may not run the model, "
<< "please fall back to other strategy."
<< std::endl;
exit(1);
}
......
......@@ -79,7 +79,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("activation", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -115,7 +116,8 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -68,7 +68,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("addn", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -111,7 +112,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -88,7 +88,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("batch_norm", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -122,7 +123,8 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -61,7 +61,8 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("bias_add", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -102,7 +103,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -106,8 +106,10 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
}
}
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, built_options);
cl::Kernel b2f_kernel;
MACE_RETURN_IF_ERROR(runtime->BuildKernel(
"buffer_to_image", obfuscated_kernel_name, built_options, &b2f_kernel));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -164,7 +166,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -62,8 +62,9 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("channel_shuffle", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -92,7 +93,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -24,12 +24,18 @@ namespace kernels {
namespace {
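// kwg_size may now legitimately be 0, since GetKernelMaxWorkGroupSize()
// returns 0 when the device query fails; each LocalWS therefore falls back
// to a 1x1x1 local size rather than dividing by zero.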
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] =
std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
}
return lws;
}
......@@ -83,7 +89,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
if (input0->dim(3) % 4 == 0) {
built_options.emplace("-DDIVISIBLE_FOUR");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -114,7 +121,8 @@ static MaceStatus Concat2(cl::Kernel *kernel,
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......@@ -157,7 +165,8 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("concat", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
......@@ -207,7 +216,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
char *kerror_code = (*kernel_error)->mutable_data<char>();
......
......@@ -27,30 +27,36 @@ const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
lws[0] = std::min<uint32_t>(gws[0], base);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
lws[0] = gws[0] / 8;
if (lws[0] < base) {
lws[0] = std::max<uint32_t>(gws[0] / 4, base);
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = gws[0] / 8;
if (lws[0] < base) {
lws[0] = std::max<uint32_t>(gws[0] / 4, base);
}
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / lws_size / compute_units) * 8,
gws[2]);
if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base);
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
(cache_size / kernel_cache_size / lws_size / compute_units) * 8, gws[2]);
if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base);
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
......@@ -130,7 +136,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_1x1", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -173,7 +180,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -26,25 +26,30 @@ namespace {
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1);
const uint32_t base =
std::max<uint32_t>(
std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] =
std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
RoundUp<uint32_t>(
cache_size / kernel_cache_size / lws_size / compute_units, base),
gws[2]);
if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1);
const uint32_t base =
std::max<uint32_t>(
std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] =
std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>(
RoundUp<uint32_t>(
cache_size / kernel_cache_size / lws_size / compute_units, base),
gws[2]);
if (lws[2] == 0) {
lws[2] = std::min<uint32_t>(gws[2], base);
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
......@@ -115,7 +120,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d_3x3", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -161,7 +167,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -30,30 +30,35 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
const uint32_t kernel_size,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
lws[0] = gws[0];
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
lws_size / compute_units) *
8,
gws[2]);
if (lws[2] == 0) {
if (gws[2] < lws_limit) {
lws[2] = gws[2];
} else {
lws[2] = base;
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
lws[0] = gws[0];
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / kernel_size /
lws_size / compute_units) *
8,
gws[2]);
if (lws[2] == 0) {
if (gws[2] < lws_limit) {
lws[2] = gws[2];
} else {
lws[2] = base;
}
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
......@@ -124,7 +129,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("conv_2d", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -173,7 +179,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -24,12 +24,18 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] =
std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
}
return lws;
}
......@@ -147,7 +153,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("crop", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("crop", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -181,7 +188,8 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -95,7 +95,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel = runtime->BuildKernel("deconv_2d", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("deconv_2d", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -148,7 +149,8 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -95,8 +95,10 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("depth_to_space", obfuscated_kernel_name,
built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("depth_to_space",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -135,7 +137,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -26,27 +26,33 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
if (lws[0] < base) {
lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
kwg_size / lws[1]);
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
if (lws[0] < base) {
lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
kwg_size / lws[1]);
}
}
lws[0] =
std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
gws[2]);
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
gws[2]);
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
......@@ -129,8 +135,9 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("depthwise_conv2d", kernel_name,
built_options, kernel));
*kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -183,7 +190,8 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -103,7 +103,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("eltwise", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -141,7 +142,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -84,8 +84,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
*kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("fully_connected", kernel_name,
built_options, kernel));
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
......@@ -160,7 +160,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
(*kernel_error)->UnMap();
}
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_STATUS(error);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
......@@ -230,8 +230,9 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
*kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("fully_connected", kernel_name,
built_options, kernel));
uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
......@@ -272,7 +273,8 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
gws->data(), *lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -245,23 +245,27 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
1);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
1);
}
return lws;
}
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
......@@ -318,6 +322,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
std::vector<uint32_t> internal_gws(gws, gws + 3);
if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 3; ++i) {
MACE_CHECK(params[i] != 0);
internal_gws[i] = RoundUp(gws[i], params[i]);
}
}
......@@ -336,7 +341,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(internal_gws[0], internal_gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_ERROR(error);
}
} else {
timer->ClearTiming();
......@@ -344,7 +349,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
kernel, cl::NullRange,
cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_ERROR(error);
timer->AccumulateTiming();
tuning_result->assign(params.begin(), params.end());
......@@ -369,7 +374,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
kernel, cl::NDRange(0, 0, i * block_size),
cl::NDRange(internal_gws[0], internal_gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_ERROR(error);
timer->AccumulateTiming();
}
}
......@@ -377,8 +382,9 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
return error;
};
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err);
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
......@@ -388,13 +394,14 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
}
};
}
return MaceStatus::MACE_SUCCESS;
}
void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
......@@ -424,6 +431,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
std::vector<uint32_t> internal_gws(gws, gws + 2);
if (!runtime->IsNonUniformWorkgroupsSupported()) {
for (size_t i = 0; i < 2; ++i) {
MACE_CHECK(params[i] != 0);
internal_gws[i] = RoundUp(gws[i], params[i]);
}
}
......@@ -442,14 +450,14 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
kernel, cl::NDRange(0, i * block_size),
cl::NDRange(internal_gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_ERROR(error);
}
} else {
timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_ERROR(error);
timer->AccumulateTiming();
tuning_result->assign(params.begin(), params.end());
......@@ -474,7 +482,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
kernel, cl::NDRange(0, i * block_size),
cl::NDRange(internal_gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_ERROR(error);
timer->AccumulateTiming();
}
}
......@@ -482,8 +490,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
return error;
};
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
......@@ -492,6 +502,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
};
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace kernels
......
......@@ -65,17 +65,17 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
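// Both tuning helpers now report failures instead of CHECK-aborting; kernel
// functors wrap each call, e.g.:
//   MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
//                                            gws, lws, future));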
inline void SetFuture(StatsFuture *future, const cl::Event &event) {
if (future != nullptr) {
......
......@@ -97,9 +97,11 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
kernel_error_->UnMap();
}
}
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, built_options);
cl::Kernel b2f_kernel;
MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options,
&b2f_kernel));
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -151,7 +153,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -74,7 +74,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -99,7 +100,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -64,8 +64,14 @@ bool BufferToImageOpImpl(Tensor *buffer,
kernel_error->UnMap();
}
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name, built_options);
cl::Kernel b2f_kernel;
cl_int error = runtime->BuildKernel("buffer_to_image",
obfuscated_kernel_name,
built_options, &b2f_kernel);
if (error != CL_SUCCESS) {
return false;
}
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -92,7 +98,6 @@ bool BufferToImageOpImpl(Tensor *buffer,
const std::vector<uint32_t> lws = {16, kwg_size / 16};
cl::Event event;
cl_int error;
if (runtime->IsNonUniformWorkgroupsSupported()) {
error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
......@@ -107,7 +112,9 @@ bool BufferToImageOpImpl(Tensor *buffer,
b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
cl::NDRange(lws[0], lws[1]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
if (error != CL_SUCCESS) {
return false;
}
runtime->command_queue().finish();
bool is_out_of_range = false;
......
......@@ -68,7 +68,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("pad", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pad", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -104,7 +105,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -25,18 +25,23 @@ namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
lws[0] = gws[0];
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
const uint32_t lws_size = lws[1] * lws[2];
lws[0] = gws[0] / 4;
if (lws[0] == 0) {
lws[0] = gws[0];
}
lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
1);
}
lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
1);
return lws;
}
......@@ -80,7 +85,10 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("pooling",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -160,7 +168,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws.data(), lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -66,13 +66,17 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
*(kernel_error_->mutable_data<char>()) = 0;
kernel_error_->UnMap();
}
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("reduce_mean", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("reduce_mean",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
}
if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
......@@ -135,13 +139,13 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
kernel_error_->UnMap();
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
......
......@@ -25,25 +25,30 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
lws[0] = gws[0] / 8;
if (lws[0] == 0) {
lws[0] = gws[0];
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
lws[0] = std::min<uint32_t>(gws[0], base);
} else {
lws[0] = gws[0] / 8;
if (lws[0] == 0) {
lws[0] = gws[0];
}
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = gws[2] / 8;
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
const uint32_t lws_size = lws[0] * lws[1];
lws[2] = gws[2] / 8;
if (lws[2] == 0) {
lws[2] = gws[2];
}
lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
1);
return lws;
}
......@@ -86,8 +91,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
MACE_RETURN_IF_ERROR(
runtime->BuildKernel("resize_bilinear",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -131,7 +139,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -61,7 +61,10 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("slice",
kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -107,7 +110,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
}
MACE_CHECK_CL_SUCCESS(error);
MACE_CL_RET_STATUS(error);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -25,19 +25,23 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) {
lws[0] = gws[0];
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
lws[0] = gws[0] / base;
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) {
lws[0] = gws[0];
} else {
lws[0] = gws[0] / base;
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
lws[2] = std::max<uint32_t>(std::min<uint32_t>(
gws[2], kwg_size / (lws[0] * lws[1])), 1);
}
lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
lws[2] = std::max<uint32_t>(std::min<uint32_t>(gws[2],
kwg_size / (lws[0] * lws[1])),
1);
return lws;
}
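For intuition, here is a trace of the LocalWS heuristic above with illustrative numbers (assumed, not taken from the source):
// Assume kwg_size = 256, gws = {64, 32, 128}, and base = 16
// (i.e. cache_size / kBaseGPUMemCacheSize == 16):
//   lws[1] = min(32, 256)                      = 32
//   gws[0] >= base, so lws[0] = 64 / 16        = 4
//   lws[0] = min(4, 256 / 32)                  = 4
//   lws[2] = max(min(128, 256 / (4 * 32)), 1)  = 2
// The resulting local work-group size is {4, 32, 2}.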
......@@ -95,7 +99,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("softmax", kernel_name,
built_options, &kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -122,7 +127,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -77,8 +77,10 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("space_to_batch", obfuscated_kernel_name,
built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("space_to_batch",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -118,7 +120,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -59,8 +59,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
if (runtime->IsNonUniformWorkgroupsSupported()) {
built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
}
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -134,7 +136,8 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......@@ -211,8 +214,10 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform",
obfuscated_kernel_name,
built_options,
&kernel_));
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......@@ -267,7 +272,8 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
gws, lws, future));
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -61,6 +61,44 @@ void UnloadModelData(const unsigned char *model_data,
MACE_CHECK(ret == 0, "Failed to unmap model data file, error code: ",
strerror(errno));
}
#ifdef MACE_ENABLE_OPENCL
MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
// Check whether OpenCL is available
auto runtime = OpenCLRuntime::Global();
if (!runtime->is_opencl_avaliable()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
// Check whether the model's max OpenCL image sizes exceed the device limits.
if (net_def == nullptr) {
return MaceStatus::MACE_INVALID_ARGS;
}
if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
auto opencl_max_image_size = runtime->GetMaxImage2DSize();
if (opencl_max_image_size.empty()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
const std::vector<int64_t> net_max_image_size =
ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
*net_def, "opencl_max_image_size", {0, 0});
if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
|| static_cast<uint64_t>(net_max_image_size[1])
> opencl_max_image_size[1]) {
LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
<< " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
return MaceStatus::MACE_SUCCESS;
}
#endif
} // namespace
// Mace Tensor
......@@ -171,6 +209,12 @@ MaceStatus MaceEngine::Impl::Init(
const std::vector<std::string> &output_nodes,
const unsigned char *model_data) {
LOG(INFO) << "Initializing MaceEngine";
// Check GPU availability
#ifdef MACE_ENABLE_OPENCL
if (device_type_ == DeviceType::GPU) {
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
}
#endif
// Get input and output information.
for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info;
......
......@@ -20,6 +20,12 @@ enum DataType {
DT_INT32 = 4;
}
enum MemoryType {
CPU_BUFFER = 0;
GPU_BUFFER = 1;
GPU_IMAGE = 2;
}
message ConstTensor {
repeated int64 dims = 1;
optional DataType data_type = 2 [default = DT_FLOAT];
......@@ -73,8 +79,9 @@ message OperatorDef {
// for memory optimization
message MemoryBlock {
optional int32 mem_id = 1;
optional uint32 x = 2;
optional uint32 y = 3;
optional MemoryType mem_type = 2;
optional uint32 x = 3;
optional uint32 y = 4;
}
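On the C++ side these fields are written through the generated protobuf setters. A minimal sketch (the include path and field values are illustrative; the set_mem_type/set_x/set_y calls appear in the generated code later in this diff):
#include "mace/proto/mace.pb.h"

mace::MemoryBlock block;
block.set_mem_id(20000);              // GPU mem ids start at mem_id_base()
block.set_mem_type(mace::GPU_IMAGE);  // the new field introduced above
block.set_x(1792);                    // image width in pixels
block.set_y(224);                     // image height in pixels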
message MemoryArena {
repeated MemoryBlock mem_block = 1;
......
......@@ -79,77 +79,102 @@ class __attribute__((visibility("default"))) FileStorageFactory
std::unique_ptr<Impl> impl_;
};
// Set Key-Value store factory. (Call Once)
// Now KVStorage is used to store the built OpenCL binaries to file,
// which could speed up the GPU initialization and first run.
// If do not call this API, the initialization maybe slow for GPU.
/// \brief Set the internal key-value storage factory. (Call once)
///
/// The storage factory is currently used to persist compiled OpenCL
/// binaries to file, which speeds up GPU initialization and the first run.
/// If this API is not called, GPU initialization may be slow.
///
/// \param storage_factory make sure your program has read/write permission
/// for the factory's storage path
__attribute__((visibility("default")))
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
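A minimal usage sketch, assuming FileStorageFactory (declared above) is constructed from a writable directory path; the path here is illustrative:
std::shared_ptr<KVStorageFactory> storage_factory(
    new FileStorageFactory("/data/local/tmp/mace_run"));
SetKVStorageFactory(storage_factory);  // call once, before creating engines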
// Just call once. (Not thread-safe)
// Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)
// if you use gpu of specific soc.
// Using OpenCL binary will speed up the initialization.
// OpenCL binary is corresponding to the OpenCL Driver version,
// you should update the binary when OpenCL Driver changed.
/// \brief Set paths to the generated OpenCL compiled kernel binary file
/// (not libOpenCL.so)
///
/// Just call once. (Not thread-safe)
/// If you target the GPU of a specific SoC, using an OpenCL binary will
/// speed up initialization. The binary corresponds to the OpenCL driver
/// version, so you should regenerate it whenever the OpenCL driver changes.
///
/// \param paths MACE will use the first file found among all paths
__attribute__((visibility("default")))
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
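For example (the file name is hypothetical; MACE uses the first path that exists):
SetOpenCLBinaryPaths({"/data/local/tmp/mace_cl_compiled_program.bin"});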
// Just call once. (Not thread-safe)
// Set the path of Generated OpenCL parameter file
// if you use gpu for specific soc.
// The parameters is the local work group size tuned for specific SOC, which
// may be faster than the general parameters.
/// \brief Set the path of the generated OpenCL parameter file
///
/// Just call once. (Not thread-safe)
/// If you use the GPU of a specific SoC, the parameters are the local
/// work-group sizes tuned for that SoC, which may be faster than the
/// general defaults.
///
/// \param path make sure your program has read/write permission for the path
__attribute__((visibility("default")))
void SetOpenCLParameterPath(const std::string &path);
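For example (the file name is hypothetical):
SetOpenCLParameterPath("/data/local/tmp/mace_run/tuned_opencl_parameter.bin");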
// Set GPU hints, currently only supports Adreno GPU.
//
// Caution: this function may hurt performance if improper parameters provided.
/// \brief Set GPU hints. Currently only Adreno GPUs are supported.
///
/// Caution: this function may hurt performance if improper parameters
/// are provided.
///
/// \param perf_hint performance hint
/// \param priority_hint priority hint
__attribute__((visibility("default")))
void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
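A hedged example, assuming the PERF_HIGH and PRIORITY_LOW enumerators of the GPUPerfHint/GPUPriorityHint enums in this header:
// Prefer GPU throughput but keep rendering responsive.
SetGPUHints(GPUPerfHint::PERF_HIGH, GPUPriorityHint::PRIORITY_LOW);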
// Set OpenMP threads number and affinity policy.
//
// Caution: this function may hurt performance if improper parameters provided.
//
// num_threads_hint is only a hint. When num_threads_hint is zero or negative,
// the function will set the threads number equaling to the number of
// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
// (AFFINITY_NONE) cores according to the policy. The threads number will
// also be truncated to the corresponding cores number when num_threads_hint
// is larger than it.
//
// The OpenMP threads will be bind to (via sched_setaffinity) big cores
// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
//
// If successful, it returns MACE_SUCCESS and error if it can't reliabley
// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
// suggested to use AFFINITY_NONE to use all cores.
/// \brief Set the OpenMP thread count and affinity policy.
///
/// Caution: this function may hurt performance if improper parameters
/// are provided.
/// When num_threads_hint is zero or negative, the thread count is set
/// equal to the number of big (AFFINITY_BIG_ONLY), little
/// (AFFINITY_LITTLE_ONLY), or all (AFFINITY_NONE) cores, according to the
/// policy. The thread count is also truncated to the corresponding core
/// count when num_threads_hint exceeds it.
/// The OpenMP threads will be bound (via sched_setaffinity) to the big
/// cores (AFFINITY_BIG_ONLY) or the little cores (AFFINITY_LITTLE_ONLY).
///
/// \param num_threads_hint only a hint for the thread count
/// \param policy one of CPUAffinityPolicy
/// \return MACE_SUCCESS on success, or an error if big-LITTLE cores cannot
/// be reliably detected (see GetBigLittleCoreIDs). In such cases, it is
/// suggested to use AFFINITY_NONE to run on all cores.
__attribute__((visibility("default")))
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy);
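A usage sketch that follows the documented fallback:
MaceStatus status =
    SetOpenMPThreadPolicy(4, CPUAffinityPolicy::AFFINITY_BIG_ONLY);
if (status != MaceStatus::MACE_SUCCESS) {
  // Big-LITTLE detection failed; fall back to all cores.
  SetOpenMPThreadPolicy(-1, CPUAffinityPolicy::AFFINITY_NONE);
}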
// Set OpenMP threads number and processor affinity.
//
// Caution: this function may hurt performance if improper parameters provided.
//
// This function may not work well on some chips (e.g. MTK). Setting thread
// affinity to offline cores may run very slow or unexpectedly. In such cases,
// please use SetOpenMPThreadPolicy with default policy instead.
/// \brief Set the OpenMP thread count and processor affinity.
///
/// Caution: this function may hurt performance if improper parameters
/// are provided.
/// This function may not work well on some chips (e.g. MTK): setting
/// thread affinity to offline cores may run very slowly or behave
/// unexpectedly. In such cases, please use SetOpenMPThreadPolicy with the
/// default policy instead.
///
/// \param num_threads the number of OpenMP threads
/// \param cpu_ids the CPU core IDs to bind the threads to
/// \return MACE_SUCCESS on success, or an error otherwise
__attribute__((visibility("default")))
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids);
// Get ARM big.LITTLE configuration.
//
// This function will detect the max frequencies of all CPU cores, and assume
// the cores with largest max frequencies as big cores, and all the remaining
// cores as little. If all cpu core's max frequencies equals, big_core_ids and
// little_core_ids will both be filled with all cpu core ids.
//
// If successful, it returns MACE_SUCCESS and error if it can't reliabley
// detect the frequency of big-LITTLE cores (e.g. MTK).
/// \brief Get the ARM big.LITTLE configuration.
///
/// This function detects the max frequency of every CPU core, treats the
/// cores with the largest max frequency as big cores, and all remaining
/// cores as little cores. If all cores' max frequencies are equal,
/// big_core_ids and little_core_ids will both be filled with all core IDs.
///
/// \param [out] big_core_ids IDs of the big cores
/// \param [out] little_core_ids IDs of the little cores
/// \return MACE_SUCCESS on success, or an error if the frequencies of
/// big-LITTLE cores cannot be reliably detected (e.g. MTK)
__attribute__((visibility("default")))
MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
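A sketch combining the two APIs above (core counts and IDs come from detection; nothing is assumed beyond the declarations shown):
std::vector<int> big_core_ids;
std::vector<int> little_core_ids;
if (GetBigLittleCoreIDs(&big_core_ids, &little_core_ids)
        == MaceStatus::MACE_SUCCESS) {
  // Pin one OpenMP thread per detected big core.
  SetOpenMPThreadAffinity(static_cast<int>(big_core_ids.size()),
                          big_core_ids);
}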
......
......@@ -12,7 +12,72 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
def mace_check(condition, msg):
if not condition:
raise Exception(msg)
def roundup_div4(value):
return int((value + 3) / 4)
class OpenCLBufferType(enum.Enum):
CONV2D_FILTER = 0
IN_OUT_CHANNEL = 1
ARGUMENT = 2
IN_OUT_HEIGHT = 3
IN_OUT_WIDTH = 4
WINOGRAD_FILTER = 5
DW_CONV2D_FILTER = 6
WEIGHT_HEIGHT = 7
WEIGHT_WIDTH = 8
def calculate_image_shape(buffer_type, shape, winograd_blk_size=0):
# Keep consistent with mace/kernel/opencl/helper.cc
image_shape = [0, 0]
if buffer_type == OpenCLBufferType.CONV2D_FILTER:
mace_check(len(shape) == 4, "Conv2D filter buffer should be 4D")
image_shape[0] = shape[1]
image_shape[1] = shape[2] * shape[3] * roundup_div4(shape[0])
elif buffer_type == OpenCLBufferType.IN_OUT_CHANNEL:
mace_check(len(shape) == 4, "Conv2D input/output buffer should be 4D")
image_shape[0] = roundup_div4(shape[3]) * shape[2]
image_shape[1] = shape[0] * shape[1]
elif buffer_type == OpenCLBufferType.ARGUMENT:
mace_check(len(shape) == 1,
"Argument buffer should be 1D not " + str(shape))
image_shape[0] = roundup_div4(shape[0])
image_shape[1] = 1
elif buffer_type == OpenCLBufferType.IN_OUT_HEIGHT:
mace_check(len(shape) == 4, "Input/output buffer should be 4D")
image_shape[0] = shape[2] * shape[3]
image_shape[1] = shape[0] * roundup_div4(shape[1])
elif buffer_type == OpenCLBufferType.IN_OUT_WIDTH:
mace_check(len(shape) == 4, "Input/output buffer should be 4D")
image_shape[0] = roundup_div4(shape[2]) * shape[3]
image_shape[1] = shape[0] * shape[1]
elif buffer_type == OpenCLBufferType.WINOGRAD_FILTER:
mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
image_shape[0] = roundup_div4(shape[1])
image_shape[1] = (shape[0] * (winograd_blk_size + 2)
* (winograd_blk_size + 2))
elif buffer_type == OpenCLBufferType.DW_CONV2D_FILTER:
mace_check(len(shape) == 4, "Winograd filter buffer should be 4D")
image_shape[0] = shape[0] * shape[2] * shape[3]
image_shape[1] = roundup_div4(shape[1])
elif buffer_type == OpenCLBufferType.WEIGHT_HEIGHT:
mace_check(len(shape) == 4, "Weight buffer should be 4D")
image_shape[0] = shape[1] * shape[2] * shape[3]
image_shape[1] = roundup_div4(shape[0])
elif buffer_type == OpenCLBufferType.WEIGHT_WIDTH:
mace_check(len(shape) == 4, "Weight buffer should be 4D")
image_shape[0] = roundup_div4(shape[1]) * shape[2] * shape[3]
image_shape[1] = shape[0]
else:
mace_check(False, "OpenCL Image do not support type "
+ str(buffer_type))
return image_shape
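As a worked example of the IN_OUT_CHANNEL case above: an NHWC tensor of shape [1, 224, 224, 32] maps to an image of width roundup_div4(32) * 224 = 8 * 224 = 1792 and height 1 * 224 = 224.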
......@@ -171,6 +171,13 @@ def main(unused_args):
output_graph_def.op.extend(cpu_graph_def.op)
output_graph_def.mem_arena.mem_block.extend(
cpu_graph_def.mem_arena.mem_block)
output_graph_arg_names = set()
for arg in output_graph_def.arg:
output_graph_arg_names.add(arg.name)
for arg in cpu_graph_def.arg:
if arg.name not in output_graph_arg_names:
output_graph_def.arg.extend([arg])
print "Merge done"
else:
option.device = device_type_map[FLAGS.runtime]
......
......@@ -163,6 +163,7 @@ class MaceKeyword(object):
mace_op_data_type_str = 'T'
mace_offset_str = 'offset'
mace_from_caffe_str = 'from_caffe'
mace_opencl_max_image_size = "opencl_max_image_size"
class TransformerRule(Enum):
......
......@@ -28,21 +28,12 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384
from mace.python.tools.convert_util import OpenCLBufferType
class OpenCLBufferType(enum.Enum):
CONV2D_FILTER = 0
IN_OUT_CHANNEL = 1
ARGUMENT = 2
IN_OUT_HEIGHT = 3
IN_OUT_WIDTH = 4
WINOGRAD_FILTER = 5
DW_CONV2D_FILTER = 6
WEIGHT_HEIGHT = 7
WEIGHT_WIDTH = 8
OPENCL_IMAGE_MAX_SIZE = 16384
class Transformer(base_converter.ConverterInterface):
......@@ -101,6 +92,7 @@ class Transformer(base_converter.ConverterInterface):
self._producer = {}
self._target_data_format = DataFormat.NHWC
self._input_output_added = False
self._opencl_max_image_size = [0, 0]
if self._option.device == DeviceType.CPU.value:
self._target_data_format = DataFormat.NCHW
......@@ -972,15 +964,26 @@ class Transformer(base_converter.ConverterInterface):
arg.name = MaceKeyword.mace_mode
arg.i = 0
tensor_shape = list(self._consts[input_name].dims)
if input_type == OpenCLBufferType.WINOGRAD_FILTER:
blk_sqr = op.output_shape[0].dims[0]
wino_blk = int(np.sqrt(blk_sqr)) - 2
wino_arg = op_def.arg.add()
wino_arg.name = MaceKeyword.mace_wino_block_size
wino_arg.i = wino_blk
img_shape = calculate_image_shape(input_type, tensor_shape,
wino_blk)
else:
img_shape = calculate_image_shape(input_type, tensor_shape)
op.input[input_idx] = output_name
# update OpenCL max image size
self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0],
img_shape[0])
self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1],
img_shape[1])
def transform_buffer_image(self):
if self._option.device != DeviceType.GPU.value:
return False
......@@ -1030,6 +1033,11 @@ class Transformer(base_converter.ConverterInterface):
MaceKeyword.mace_activation_type_str).s == ActivationType.PRELU.name: # noqa
self.buffer_to_image(op, 1, OpenCLBufferType.ARGUMENT)
# Add OpenCL max image size
arg = net.arg.add()
arg.name = MaceKeyword.mace_opencl_max_image_size
arg.ints.extend(self._opencl_max_image_size)
for input_node in self._option.input_nodes.values():
new_input_name = MaceKeyword.mace_input_node_name \
+ '_' + input_node.name
......
......@@ -16,6 +16,24 @@ import sys
import operator
from mace.proto import mace_pb2
from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.convert_util import calculate_image_shape
from mace.python.tools.convert_util import OpenCLBufferType
class MemoryBlock(object):
def __init__(self, mem_type, block):
self._mem_type = mem_type
self._block = block
@property
def mem_type(self):
return self._mem_type
@property
def block(self):
return self._block
class MemoryOptimizer(object):
def __init__(self, net_def):
......@@ -24,7 +42,6 @@ class MemoryOptimizer(object):
self.op_mem = {} # op_name->mem_id
self.mem_block = {} # mem_id->[size] or mem_id->[x, y]
self.total_mem_count = 0
self.total_cpu_mem_count = 0
self.input_ref_counter = {}
self.mem_ref_counter = {}
......@@ -52,23 +69,27 @@ class MemoryOptimizer(object):
return True
def get_op_mem_block(self, op_type, output_shape):
return [reduce(operator.mul, output_shape, 1)]
return MemoryBlock(mace_pb2.CPU_BUFFER,
[reduce(operator.mul, output_shape, 1)])
def mem_size(self, memory_block):
return memory_block[0]
return memory_block.block[0]
def sub_mem_block(self, mem_block1, mem_block2):
return self.mem_size(mem_block1) - self.mem_size(mem_block2)
def resize_mem_block(self, old_mem_block, op_mem_block):
return [max(old_mem_block[0], op_mem_block[0])]
return MemoryBlock(
old_mem_block.mem_type,
[max(old_mem_block.block[0], op_mem_block.block[0])])
def add_net_mem_blocks(self):
for mem in self.mem_block:
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
block.mem_type = self.mem_block[mem].mem_type
block.x = self.mem_block[mem].block[0]
block.y = 1
def get_total_origin_mem_size(self):
......@@ -82,7 +103,7 @@ class MemoryOptimizer(object):
def get_total_optimized_mem_size(self):
optimized_mem_size = 0
for mem in self.mem_block:
print mem, self.mem_block[mem]
print mem, self.mem_block[mem].mem_type, self.mem_block[mem].block
optimized_mem_size += self.mem_size(self.mem_block[mem])
return optimized_mem_size
......@@ -117,6 +138,8 @@ class MemoryOptimizer(object):
best_mem_waste_size = sys.maxint
for mid in self.idle_mem:
old_mem_block = self.mem_block[mid]
if old_mem_block.mem_type != op_mem_block.mem_type:
continue
new_mem_block = self.resize_mem_block(
old_mem_block, op_mem_block)
add_mem_size = self.sub_mem_block(new_mem_block,
......@@ -185,53 +208,76 @@ class GPUMemoryOptimizer(MemoryOptimizer):
for arg in op.arg:
if arg.name == 'mode' and arg.i == 0:
return False
elif op.type == 'Shape':
for i in range(len(op.output)):
mem_id = self.total_cpu_mem_count
self.total_cpu_mem_count += 1
op_mem_block = self.get_op_mem_block(
op.type,
op.output_shape[i].dims)
self.mem_block[mem_id] = op_mem_block
return False
return op.type != 'ImageToBuffer'
def get_op_mem_block(self, op_type, output_shape):
mem_block = [0, 0]
if op_type == 'WinogradTransform' or op_type == 'MatMul':
mem_block[0] = output_shape[2]
mem_block[1] = output_shape[0] * int((output_shape[1] + 3) / 4)
buffer_shape = list(output_shape) + [1]
mem_block = MemoryBlock(
mace_pb2.GPU_IMAGE,
calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
buffer_shape))
elif op_type == 'Shape':
mem_block[0] = output_shape[0]
mem_block[1] = 1
mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
[output_shape[0], 1])
else:
if len(output_shape) == 2: # only support fc/softmax
mem_block[0] = int((output_shape[1] + 3) / 4)
mem_block[1] = output_shape[0]
buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
elif len(output_shape) == 4:
mem_block[0] = output_shape[2] * int((output_shape[3] + 3) / 4)
mem_block[1] = output_shape[0] * output_shape[1]
buffer_shape = output_shape
else:
raise Exception('output shape dim size is not 2 or 4.')
mem_block = MemoryBlock(
mace_pb2.GPU_IMAGE,
calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,
buffer_shape))
return mem_block
def mem_size(self, memory_block):
return memory_block[0] * memory_block[1] * 4
if memory_block.mem_type == mace_pb2.GPU_IMAGE:
return memory_block.block[0] * memory_block.block[1] * 4
else:
return memory_block.block[0]
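For intuition: a 4-D op output of shape [1, 56, 56, 64] becomes an IN_OUT_CHANNEL image block of [roundup_div4(64) * 56, 1 * 56] = [896, 56], so mem_size returns 896 * 56 * 4 = 200704 (each image pixel holds four values).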
def resize_mem_block(self, old_mem_block, op_mem_block):
resize_mem_block = [
max(old_mem_block[0], op_mem_block[0]),
max(old_mem_block[1], op_mem_block[1])
]
resize_mem_block = MemoryBlock(
old_mem_block.mem_type,
[
max(old_mem_block.block[0], op_mem_block.block[0]),
max(old_mem_block.block[1], op_mem_block.block[1])
])
return resize_mem_block
def add_net_mem_blocks(self):
max_image_size_x = 0
max_image_size_y = 0
for mem in self.mem_block:
arena = self.net_def.mem_arena
block = arena.mem_block.add()
block.mem_id = mem
block.x = self.mem_block[mem][0]
block.y = self.mem_block[mem][1]
block.mem_type = self.mem_block[mem].mem_type
block.x = self.mem_block[mem].block[0]
block.y = self.mem_block[mem].block[1]
if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE:
max_image_size_x = max(max_image_size_x, block.x)
max_image_size_y = max(max_image_size_y, block.y)
# Update OpenCL max image size
net_ocl_max_img_size_arg = None
for arg in self.net_def.arg:
if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size:
net_ocl_max_img_size_arg = arg
max_image_size_x = max(arg.ints[0], max_image_size_x)
max_image_size_y = max(arg.ints[1], max_image_size_y)
break
if net_ocl_max_img_size_arg is None:
net_ocl_max_img_size_arg = self.net_def.arg.add()
net_ocl_max_img_size_arg.name = \
cvt.MaceKeyword.mace_opencl_max_image_size
net_ocl_max_img_size_arg.ints[:] = [max_image_size_x,
max_image_size_y]
def mem_id_base(self):
return 20000
......
......@@ -129,6 +129,7 @@ void CreateMemoryArena(mace::MemoryArena *mem_arena) {
mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block();
mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}});
mem_block{{i}}->set_mem_type(static_cast<MemoryType>({{net.mem_arena.mem_block[i].mem_type}}));
mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}});
mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}});
......
......@@ -244,6 +244,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < input_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(in_mem_block_x);
mem_blk_ptr->set_y(in_mem_block_y);
res[input_names[i]] = mem_id;
......@@ -263,6 +264,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < output_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(out_mem_block_x);
mem_blk_ptr->set_y(out_mem_block_y);
res[output_names[i]] = mem_id;
......
......@@ -245,6 +245,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < input_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(in_mem_block_x);
mem_blk_ptr->set_y(in_mem_block_y);
res[input_names[i]] = mem_id;
......@@ -264,6 +265,7 @@ std::map<std::string, int> AddMemoryOptimization(
for (size_t i = 0; i < output_size; ++i) {
MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block();
mem_blk_ptr->set_mem_id(mem_id);
mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE);
mem_blk_ptr->set_x(out_mem_block_x);
mem_blk_ptr->set_y(out_mem_block_y);
res[output_names[i]] = mem_id;
......