Commit e4860f68 authored by Liangliang He

Refactor OpenCL static dependencies

Parent 2bee11ed
......@@ -25,7 +25,7 @@ config_setting(
)
config_setting(
name = "is_profiling",
name = "profiling_enabled",
define_values = {
"profiling": "true",
},
......
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_profiling")
load("//mace:mace.bzl", "if_android", "if_profiling_enabled")
cc_library(
name = "opencl_runtime",
......@@ -15,76 +15,48 @@ cc_library(
"runtime/opencl/*.cc",
]),
hdrs = glob([
"runtime/opencl/cl.hpp",
"runtime/opencl/cl2.hpp",
"runtime/opencl/*.h",
]),
copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"] +
if_profiling_enabled(["-DMACE_OPENCL_PROFILING"]),
linkopts = ["-ldl"],
deps = [
":logging",
":core",
"//mace/utils:logging",
"//mace/utils:tuner",
"@opencl_headers//:opencl20_headers",
],
alwayslink = 1,
)
cc_library(
name = "logging",
srcs = [
"logging.cc",
],
hdrs = [
"logging.h",
],
copts = ["-std=c++11"],
linkopts = if_android([
"-llog",
]),
)
cc_library(
name = "core",
srcs = glob(
["*.cc",],
exclude=[
"logging.cc",
]),
hdrs = glob(
["*.h"],
exclude=[
"logging.h",
]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"] + if_android([
"-D__USE_OPENCL",
]),
linkopts = ["-ldl"] + if_android([
"-pie",
]),
srcs = glob(["*.cc"]),
hdrs = glob(["*.h"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android(["-pie"]),
deps = [
":logging",
"//mace/proto:stats_proto",
"//mace/utils",
":opencl_runtime",
"//mace/utils:utils_hdrs",
"//mace/utils:logging",
],
)
# Main program for tests
cc_library(
name = "test_benchmark_main",
testonly = 1,
srcs = glob([
"testing/*.cc",
]),
hdrs = glob([
"testing/*.h",
]),
copts = [
"-std=c++11",
"-D_GLIBCXX_USE_C99_MATH_TR1",
hdrs = [
"testing/test_benchmark.h",
],
srcs = [
"testing/test_benchmark.cc",
"testing/test_benchmark_main.cc",
],
linkopts = ["-lm"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
alwayslink = 1,
deps = [
":core",
"//mace/utils:utils_hdrs",
],
alwayslink = 1,
)
......@@ -3,9 +3,6 @@
//
#include "mace/core/allocator.h"
#ifdef __USE_OPENCL
#include "mace/core/opencl_allocator.h"
#endif
namespace mace {
......@@ -25,8 +22,5 @@ Allocator *GetDeviceAllocator(DeviceType type) {
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
#ifdef __USE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
#endif
} // namespace mace
......@@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
using std::set;
using std::map;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_FUTURE_H_
#define MACE_CORE_FUTURE_H_
#include <functional>
#include "mace/utils/logging.h"
namespace mace {
struct CallStats {
int64_t start_micros;
int64_t end_micros;
};
struct OperatorStats {
std::string operator_name;
std::string type;
CallStats stats;
};
struct RunMetadata {
std::vector<OperatorStats> op_stats;
};
// Wait for the call to finish and fill in the stats if the param is not nullptr
struct StatsFuture {
std::function<void(CallStats *)> wait_fn = [](CallStats *) {
LOG(FATAL) << "wait_fn must be properly set";
};
};
} // namespace mace
#endif // MACE_CORE_FUTURE_H_
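Note: the default wait_fn deliberately aborts, so whoever returns a StatsFuture must install a real callback before anyone waits on it. A minimal sketch of the intended contract, assuming an OpenCL op that has already enqueued a kernel and holds its cl::Event (the event here is illustrative, not part of this diff):

cl::Event event;  // assumed: the event recorded when the kernel was enqueued
StatsFuture future;
future.wait_fn = [event](CallStats *stats) {
  event.wait();  // block until the asynchronous work completes
  if (stats != nullptr) {
    stats->start_micros =
        event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
    stats->end_micros =
        event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
  }
};

// Caller side: wait, optionally collecting the stats.
CallStats call_stats;
future.wait_fn(&call_stats);  // or future.wait_fn(nullptr) to just wait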
......@@ -7,7 +7,7 @@
#include <cstdint>
#include <vector>
#include <string>
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
namespace mace {
......
......@@ -4,9 +4,6 @@
#include "mace/core/net.h"
#include "mace/utils/utils.h"
#ifdef __USE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
namespace mace {
......@@ -33,65 +30,51 @@ SimpleNet::SimpleNet(const std::shared_ptr<const NetDef> &net_def,
}
}
}
bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Running net " << name_;
for (auto &op : operators_) {
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
bool future_wait = (device_type_ == DeviceType::OPENCL &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
auto &op = *iter;
VLOG(1) << "Running operator " << op->debug_def().name() << "("
<< op->debug_def().type() << ").";
OperatorStats *op_stats = nullptr;
if (run_metadata) {
if (device_type_ != DeviceType::OPENCL) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(NowInMicroSec());
op_stats->set_op_start_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
bool ret;
CallStats call_stats;
if (future_wait) {
StatsFuture future;
ret = op->Run(&future);
if (run_metadata != nullptr) {
future.wait_fn(&call_stats);
} else {
future.wait_fn(nullptr);
}
} else if (run_metadata != nullptr) {
call_stats.start_micros = NowInMicroSec();
ret = op->Run(nullptr);
call_stats.end_micros = NowInMicroSec();
} else {
ret = op->Run(nullptr);
}
if (!op->Run()) {
if (run_metadata != nullptr) {
OperatorStats op_stats = { op->debug_def().name(),
op->debug_def().type(),
call_stats };
run_metadata->op_stats.emplace_back(op_stats);
}
if (!ret) {
LOG(ERROR) << "Operator failed: " << op->debug_def().name();
return false;
}
if (run_metadata) {
if (device_type_ == DeviceType::OPENCL) {
#ifndef __USE_OPENCL
LOG(FATAL) << "OpenCL is not supported";
#else
OpenCLRuntime::Get()->command_queue().finish();
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(
OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000);
op_stats->set_op_start_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_op_end_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
#endif
} else {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
}
}
VLOG(1) << "Op " << op->debug_def().name()
<< " has shape: " << internal::MakeString(op->Output(0)->shape());
}
#ifdef __USE_OPENCL
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
#endif
return true;
}
......
......@@ -9,7 +9,6 @@
#include "mace/core/operator.h"
#include "mace/core/workspace.h"
#include "mace/core/mace.h"
#include "mace/proto/stats.pb.h"
namespace mace {
......
......@@ -7,6 +7,7 @@
#include "mace/core/common.h"
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
......@@ -55,7 +56,8 @@ class OperatorBase {
inline const vector<const Tensor *> &Inputs() const { return inputs_; }
inline const vector<Tensor *> &Outputs() { return outputs_; }
virtual bool Run() = 0;
// Run the op, asynchronously where the device supports it; if future is not
// nullptr, fill it in so the caller can wait and collect stats.
virtual bool Run(StatsFuture *future) = 0;
inline const OperatorDef &debug_def() const {
MACE_CHECK(has_debug_def(), "operator_def was null!");
......@@ -100,7 +102,7 @@ class Operator : public OperatorBase {
}
}
}
virtual bool Run() override = 0;
virtual bool Run(StatsFuture *future) override = 0;
~Operator() noexcept override {}
};
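Note: with the signature change, a synchronous (CPU) op can simply compute and, when a future is supplied, install a no-op wait; only asynchronous device ops need to capture a real event. A hypothetical sketch (MyCpuOp and its body are placeholders, not part of this diff):

template <typename T>
class MyCpuOp : public Operator<DeviceType::CPU, T> {
 public:
  using Operator<DeviceType::CPU, T>::Operator;
  bool Run(StatsFuture *future) override {
    // ... synchronous computation on Inputs()/Outputs() ...
    if (future != nullptr) {
      // Nothing is outstanding on CPU, so waiting is a no-op.
      future->wait_fn = [](CallStats *stats) {
        if (stats != nullptr) { stats->start_micros = stats->end_micros = 0; }
      };
    }
    return true;
  }
};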
......
(Source diff too large to display; view the blob instead.)
......@@ -5,6 +5,8 @@
#ifndef MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_
#define MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_
// Do not include cl2.hpp directly, include this header instead.
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include "mace/core/runtime/opencl/cl2.hpp"
......
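Note: cl2.hpp configures itself from macros like CL_HPP_TARGET_OPENCL_VERSION at inclusion time, so every translation unit must see the same settings; routing all includes through this shim is what guarantees that. In short:

// Correct: pick up the version macros defined by the shim.
#include "mace/core/runtime/opencl/cl2_header.h"

// Wrong: bypasses the shim and compiles against default version settings.
// #include "mace/core/runtime/opencl/cl2.hpp"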
......@@ -3,7 +3,7 @@
//
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
......@@ -37,7 +37,7 @@ OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) {
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Get()->context(),
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
......@@ -53,7 +53,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(),
new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
......@@ -79,7 +79,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
auto queue = OpenCLRuntime::Global()->command_queue();
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
......@@ -101,7 +101,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch.resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Get()->command_queue().enqueueMapImage(*cl_image,
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
......@@ -114,12 +114,13 @@ void *OpenCLAllocator::MapImage(void *buffer,
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
auto queue = OpenCLRuntime::Global()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS);
}
bool OpenCLAllocator::OnHost() { return false; }
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
} // namespace mace
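Note: moving the registration here (plus alwayslink = 1 on :opencl_runtime) means linking the OpenCL runtime is what makes the OPENCL allocator available; client code keeps using the generic lookup. A usage sketch, assuming Delete() is the counterpart to New():

Allocator *allocator = GetDeviceAllocator(DeviceType::OPENCL);
void *buffer = allocator->New(1024);            // returns a cl::Buffer*
void *host_ptr = allocator->Map(buffer, 1024);  // blocking map
// ... read or write through host_ptr ...
allocator->Unmap(buffer, host_ptr);
allocator->Delete(buffer);                      // assumed counterpart to New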
......@@ -7,15 +7,17 @@
#include <memory>
#include <mutex>
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/logging.h"
#include "mace/utils/tuner.h"
#include <CL/opencl.h>
namespace mace {
namespace {
bool ReadFile(const std::string &filename, bool binary,
bool ReadFile(const std::string &filename,
bool binary,
std::vector<unsigned char> *content_ptr) {
MACE_CHECK_NOTNULL(content_ptr);
......@@ -55,7 +57,8 @@ bool ReadFile(const std::string &filename, bool binary,
return true;
}
bool WriteFile(const std::string &filename, bool binary,
bool WriteFile(const std::string &filename,
bool binary,
const std::vector<unsigned char> &content) {
std::ios_base::openmode mode = std::ios::out;
if (binary) {
......@@ -76,124 +79,92 @@ bool WriteFile(const std::string &filename, bool binary,
} // namespace
bool OpenCLRuntime::enable_profiling_ = false;
std::unique_ptr<cl::Event> OpenCLRuntime::profiling_ev_ = nullptr;
void OpenCLProfilingTimer::StartTiming() {}
OpenCLRuntime *OpenCLRuntime::Get() {
static std::once_flag init_once;
static OpenCLRuntime *instance = nullptr;
std::call_once(init_once, []() {
if (!mace::OpenCLLibrary::Supported()) {
LOG(ERROR) << "OpenCL not supported";
return;
}
void OpenCLProfilingTimer::StopTiming() {
OpenCLRuntime::Global()->command_queue().finish();
start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
}
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(ERROR) << "No OpenCL platforms found";
return;
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: "
<< default_platform.getInfo<CL_PLATFORM_NAME>() << ", "
<< default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(ERROR) << "No OpenCL devices found";
return;
}
double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0;
}
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(ERROR) << "No GPU device found";
return;
}
OpenCLRuntime *OpenCLRuntime::Global() {
static OpenCLRuntime instance;
return &instance;
}
cl_command_queue_properties properties = 0;
#ifdef __ENABLE_PROFILING
enable_profiling_ = true;
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
#endif
OpenCLRuntime::OpenCLRuntime() {
LoadOpenCLLibrary();
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device, properties);
instance = new OpenCLRuntime(context, gpu_device, command_queue);
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(FATAL) << "No OpenCL platforms found";
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>()
<< ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(FATAL) << "No OpenCL devices found";
}
});
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(FATAL) << "No GPU device found";
}
return instance;
}
cl_command_queue_properties properties = 0;
void OpenCLRuntime::EnableProfiling() { enable_profiling_ = true; }
#ifdef MACE_OPENCL_PROFILING
properties |= CL_QUEUE_PROFILING_ENABLE;
#endif
cl::Event *OpenCLRuntime::GetDefaultEvent() { return profiling_ev_.get(); }
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device, properties);
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
}
const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context);
this->command_queue_ = new cl::CommandQueue(command_queue);
}
OpenCLRuntime::OpenCLRuntime(cl::Context context, cl::Device device,
cl::CommandQueue command_queue)
: context_(context), device_(device), command_queue_(command_queue) {
const char *kernel_path = getenv("MACE_KERNEL_PATH");
kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
OpenCLRuntime::~OpenCLRuntime() {
built_program_map_.clear();
delete command_queue_;
delete context_;
delete device_;
UnloadOpenCLLibrary();
}
OpenCLRuntime::~OpenCLRuntime() {}
cl::Context &OpenCLRuntime::context() { return context_; }
cl::Context &OpenCLRuntime::context() { return *context_; }
cl::Device &OpenCLRuntime::device() { return device_; }
cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
cl::Program &OpenCLRuntime::program() {
// TODO(liuqi): unused; kept only for compatibility with old code.
return program_;
}
// TODO(heliangliang) Support binary format
const std::map<std::string, std::string> OpenCLRuntime::program_map_ = {
{"addn", "addn.cl"},
{"batch_norm", "batch_norm.cl"},
{"bias_add", "bias_add.cl"},
{"buffer_to_image", "buffer_to_image.cl"},
{"conv_2d", "conv_2d.cl"},
{"conv_2d_1x1", "conv_2d_1x1.cl"},
{"conv_2d_3x3", "conv_2d_3x3.cl"},
{"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
{"pooling", "pooling.cl"},
{"relu", "relu.cl"},
{"concat", "concat.cl"},
{"resize_bilinear", "resize_bilinear.cl"},
{"space_to_batch", "space_to_batch.cl"},
};
std::string
OpenCLRuntime::GenerateCLBinaryFilenamePrefix(const std::string &filename_msg) {
std::string OpenCLRuntime::GenerateCLBinaryFilenamePrefix(
const std::string &filename_msg) {
std::string filename_prefix = filename_msg;
for (auto it = filename_prefix.begin(); it != filename_prefix.end(); ++it) {
if (*it == ' ' || *it == '-' || *it == '=') {
......@@ -262,7 +233,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
program_binary_sizes.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[ device_list_size ]);
new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) {
program_binaries[i] = std::unique_ptr<unsigned char[]>(
new unsigned char[program_binary_sizes[i]]);
......@@ -281,16 +252,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
}
}
cl::Kernel
OpenCLRuntime::BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
auto kernel_program_it = program_map_.find(program_name);
if (kernel_program_it == program_map_.end()) {
MACE_CHECK(false, program_name, " opencl kernel doesn't exist.");
}
std::string program_file_name = kernel_program_it->second;
cl::Kernel OpenCLRuntime::BuildKernel(
const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
std::string program_file_name = program_name + ".cl";
std::string build_options_str;
for (auto &option : build_options) {
build_options_str += " " + option;
......@@ -312,15 +278,24 @@ OpenCLRuntime::BuildKernel(const std::string &program_name,
return cl::Kernel(program, kernel_name.c_str());
}
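Note: with program_map_ gone, the program name now maps directly to "<program_name>.cl", so adding a new kernel file no longer requires editing the runtime. A typical call ("relu" comes from the removed map; the macro option is illustrative):

auto runtime = OpenCLRuntime::Global();
std::set<std::string> build_options = {"-DDATA_TYPE=float"};  // hypothetical
cl::Kernel kernel = runtime->BuildKernel("relu", "relu", build_options);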
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) {
stats->start_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
stats->end_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
}
}
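Note: GetCallStats converts the event's nanosecond profiling counters into the microsecond CallStats that SimpleNet stores in RunMetadata, e.g.:

CallStats stats;
// completed_event: a finished cl::Event from a profiling-enabled queue
OpenCLRuntime::Global()->GetCallStats(completed_event, &stats);
int64_t duration_micros = stats.end_micros - stats.start_micros;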
uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
unsigned long long size = 0;
device_.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
return static_cast<uint32_t>(size);
}
uint32_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
unsigned long long size = 0;
kernel.getWorkGroupInfo(device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
return static_cast<uint32_t>(size);
}
......
......@@ -10,36 +10,42 @@
#include <mutex>
#include <set>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
#include "mace/utils/timer.h"
namespace mace {
class OpenCLRuntime {
public:
static OpenCLRuntime *Get();
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {};
void StartTiming() override;
void StopTiming() override;
double ElapsedMicros() override;
static void EnableProfiling();
cl::Event *GetDefaultEvent();
cl_ulong GetEventProfilingStartInfo();
cl_ulong GetEventProfilingEndInfo();
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
};
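Note: the timer only reads what the driver recorded on the event, so the event must come from a queue created with CL_QUEUE_PROFILING_ENABLE (i.e. a build with MACE_OPENCL_PROFILING). A sketch of the intended call sequence (the enqueue is illustrative):

cl::Event event;
// ... enqueue a kernel on OpenCLRuntime::Global()->command_queue(),
//     passing &event as the last argument ...
OpenCLProfilingTimer timer(&event);
timer.StartTiming();  // no-op; the timestamps come from the event itself
timer.StopTiming();   // finish() the queue, then read the start/end counters
double micros = timer.ElapsedMicros();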
class OpenCLRuntime {
public:
static OpenCLRuntime *Global();
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
cl::Program &program();
void GetCallStats(const cl::Event &event, CallStats *stats);
uint32_t GetDeviceMaxWorkGroupSize();
uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
private:
OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue);
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime&) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
......@@ -51,19 +57,14 @@ class OpenCLRuntime {
std::string GenerateCLBinaryFilenamePrefix(const std::string &filename_msg);
private:
static bool enable_profiling_;
static std::unique_ptr<cl::Event> profiling_ev_;
cl::Context context_;
cl::Device device_;
cl::CommandQueue command_queue_;
cl::Program program_;
// All OpenCL objects must be held by pointer and manually deleted before
// unloading the OpenCL library.
cl::Context *context_;
cl::Device *device_;
cl::CommandQueue *command_queue_;
std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_;
std::string kernel_path_;
static const std::map<std::string,
std::string> program_map_;
mutable std::map<std::string,
cl::Program> built_program_map_;
};
} // namespace mace
......
......@@ -4,11 +4,10 @@
#include "CL/opencl.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
#include <dlfcn.h>
#include <mutex>
/**
* Wrapper of OpenCL 2.0 (based on 1.2)
......@@ -18,10 +17,8 @@ namespace mace {
namespace {
class OpenCLLibraryImpl final {
public:
static OpenCLLibraryImpl &Get();
bool Load();
void Unload();
bool loaded() { return handle_ != nullptr; }
using clGetPlatformIDsFunc = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *);
using clGetPlatformInfoFunc =
......@@ -113,11 +110,8 @@ class OpenCLLibraryImpl final {
const cl_event *,
cl_event *,
cl_int *);
using clCreateCommandQueueWithPropertiesFunc =
cl_command_queue (*)(cl_context /* context */,
cl_device_id /* device */,
const cl_queue_properties * /* properties */,
cl_int * /* errcode_ret */);
using clCreateCommandQueueWithPropertiesFunc = cl_command_queue (*)(
cl_context, cl_device_id, const cl_queue_properties *, cl_int *);
using clReleaseCommandQueueFunc = cl_int (*)(cl_command_queue);
using clCreateProgramWithBinaryFunc = cl_program (*)(cl_context,
cl_uint,
......@@ -161,82 +155,70 @@ class OpenCLLibraryImpl final {
void *,
size_t *);
using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
using clGetImageInfoFunc = cl_int (*)(cl_mem,
cl_image_info,
size_t,
void *,
size_t *);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
DEFINE_FUNC_PTR(clGetPlatformIDs);
DEFINE_FUNC_PTR(clGetPlatformInfo);
DEFINE_FUNC_PTR(clBuildProgram);
DEFINE_FUNC_PTR(clEnqueueNDRangeKernel);
DEFINE_FUNC_PTR(clSetKernelArg);
DEFINE_FUNC_PTR(clReleaseKernel);
DEFINE_FUNC_PTR(clCreateProgramWithSource);
DEFINE_FUNC_PTR(clCreateBuffer);
DEFINE_FUNC_PTR(clCreateImage);
DEFINE_FUNC_PTR(clRetainKernel);
DEFINE_FUNC_PTR(clCreateKernel);
DEFINE_FUNC_PTR(clGetProgramInfo);
DEFINE_FUNC_PTR(clFlush);
DEFINE_FUNC_PTR(clFinish);
DEFINE_FUNC_PTR(clReleaseProgram);
DEFINE_FUNC_PTR(clRetainContext);
DEFINE_FUNC_PTR(clGetContextInfo);
DEFINE_FUNC_PTR(clCreateProgramWithBinary);
DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
DEFINE_FUNC_PTR(clReleaseCommandQueue);
DEFINE_FUNC_PTR(clEnqueueMapBuffer);
DEFINE_FUNC_PTR(clEnqueueMapImage);
DEFINE_FUNC_PTR(clRetainProgram);
DEFINE_FUNC_PTR(clGetProgramBuildInfo);
DEFINE_FUNC_PTR(clEnqueueReadBuffer);
DEFINE_FUNC_PTR(clEnqueueWriteBuffer);
DEFINE_FUNC_PTR(clWaitForEvents);
DEFINE_FUNC_PTR(clReleaseEvent);
DEFINE_FUNC_PTR(clCreateContext);
DEFINE_FUNC_PTR(clCreateContextFromType);
DEFINE_FUNC_PTR(clReleaseContext);
DEFINE_FUNC_PTR(clRetainCommandQueue);
DEFINE_FUNC_PTR(clEnqueueUnmapMemObject);
DEFINE_FUNC_PTR(clRetainMemObject);
DEFINE_FUNC_PTR(clReleaseMemObject);
DEFINE_FUNC_PTR(clGetDeviceInfo);
DEFINE_FUNC_PTR(clGetDeviceIDs);
DEFINE_FUNC_PTR(clRetainDevice);
DEFINE_FUNC_PTR(clReleaseDevice);
DEFINE_FUNC_PTR(clRetainEvent);
DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
DEFINE_FUNC_PTR(clGetEventProfilingInfo);
DEFINE_FUNC_PTR(clGetImageInfo);
#undef DEFINE_FUNC_PTR
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
using clGetImageInfoFunc =
cl_int (*)(cl_mem, cl_image_info, size_t, void *, size_t *);
#define MACE_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr
MACE_CL_DEFINE_FUNC_PTR(clGetPlatformIDs);
MACE_CL_DEFINE_FUNC_PTR(clGetPlatformInfo);
MACE_CL_DEFINE_FUNC_PTR(clBuildProgram);
MACE_CL_DEFINE_FUNC_PTR(clEnqueueNDRangeKernel);
MACE_CL_DEFINE_FUNC_PTR(clSetKernelArg);
MACE_CL_DEFINE_FUNC_PTR(clReleaseKernel);
MACE_CL_DEFINE_FUNC_PTR(clCreateProgramWithSource);
MACE_CL_DEFINE_FUNC_PTR(clCreateBuffer);
MACE_CL_DEFINE_FUNC_PTR(clCreateImage);
MACE_CL_DEFINE_FUNC_PTR(clRetainKernel);
MACE_CL_DEFINE_FUNC_PTR(clCreateKernel);
MACE_CL_DEFINE_FUNC_PTR(clGetProgramInfo);
MACE_CL_DEFINE_FUNC_PTR(clFlush);
MACE_CL_DEFINE_FUNC_PTR(clFinish);
MACE_CL_DEFINE_FUNC_PTR(clReleaseProgram);
MACE_CL_DEFINE_FUNC_PTR(clRetainContext);
MACE_CL_DEFINE_FUNC_PTR(clGetContextInfo);
MACE_CL_DEFINE_FUNC_PTR(clCreateProgramWithBinary);
MACE_CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
MACE_CL_DEFINE_FUNC_PTR(clReleaseCommandQueue);
MACE_CL_DEFINE_FUNC_PTR(clEnqueueMapBuffer);
MACE_CL_DEFINE_FUNC_PTR(clEnqueueMapImage);
MACE_CL_DEFINE_FUNC_PTR(clRetainProgram);
MACE_CL_DEFINE_FUNC_PTR(clGetProgramBuildInfo);
MACE_CL_DEFINE_FUNC_PTR(clEnqueueReadBuffer);
MACE_CL_DEFINE_FUNC_PTR(clEnqueueWriteBuffer);
MACE_CL_DEFINE_FUNC_PTR(clWaitForEvents);
MACE_CL_DEFINE_FUNC_PTR(clReleaseEvent);
MACE_CL_DEFINE_FUNC_PTR(clCreateContext);
MACE_CL_DEFINE_FUNC_PTR(clCreateContextFromType);
MACE_CL_DEFINE_FUNC_PTR(clReleaseContext);
MACE_CL_DEFINE_FUNC_PTR(clRetainCommandQueue);
MACE_CL_DEFINE_FUNC_PTR(clEnqueueUnmapMemObject);
MACE_CL_DEFINE_FUNC_PTR(clRetainMemObject);
MACE_CL_DEFINE_FUNC_PTR(clReleaseMemObject);
MACE_CL_DEFINE_FUNC_PTR(clGetDeviceInfo);
MACE_CL_DEFINE_FUNC_PTR(clGetDeviceIDs);
MACE_CL_DEFINE_FUNC_PTR(clRetainDevice);
MACE_CL_DEFINE_FUNC_PTR(clReleaseDevice);
MACE_CL_DEFINE_FUNC_PTR(clRetainEvent);
MACE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
MACE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
MACE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
#undef MACE_CL_DEFINE_FUNC_PTR
private:
void *LoadFromPath(const std::string &path);
void *handle_ = nullptr;
};
OpenCLLibraryImpl &OpenCLLibraryImpl::Get() {
static std::once_flag load_once;
static OpenCLLibraryImpl instance;
std::call_once(load_once, []() { instance.Load(); });
return instance;
}
bool OpenCLLibraryImpl::Load() {
if (loaded()) return true;
if (handle_ != nullptr) { return true; }
// TODO(heliangliang) Make this configurable
// TODO(heliangliang) Benchmark 64 bit overhead
static const std::vector<std::string> paths = {
const std::vector<std::string> paths = {
"libOpenCL.so",
#if defined(__aarch64__)
// Qualcomm Adreno
......@@ -260,12 +242,16 @@ bool OpenCLLibraryImpl::Load() {
void *handle = LoadFromPath(path);
if (handle != nullptr) {
handle_ = handle;
return true;
break;
}
}
LOG(ERROR) << "Failed to load OpenCL library";
return false;
if (handle_ == nullptr) {
LOG(ERROR) << "Failed to load OpenCL library";
return false;
}
return true;
}
void OpenCLLibraryImpl::Unload() {
......@@ -286,7 +272,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
return nullptr;
}
#define ASSIGN_FROM_DLSYM(func) \
#define MACE_CL_ASSIGN_FROM_DLSYM(func) \
do { \
void *ptr = dlsym(handle, #func); \
if (ptr == nullptr) { \
......@@ -298,86 +284,91 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
VLOG(2) << "Loaded " << #func << " from " << path; \
} while (false)
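Note: for a single symbol the macro expands to roughly the following; the assignment line sits in the collapsed part of this hunk, so its exact form is assumed:

do {
  void *ptr = dlsym(handle, "clGetPlatformIDs");
  if (ptr == nullptr) {
    // ... presumably log the missing symbol ...
  }
  // Assumed: store the resolved pointer in the member of the same name.
  clGetPlatformIDs = reinterpret_cast<clGetPlatformIDsFunc>(ptr);
  VLOG(2) << "Loaded " << "clGetPlatformIDs" << " from " << path;
} while (false);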
ASSIGN_FROM_DLSYM(clGetPlatformIDs);
ASSIGN_FROM_DLSYM(clGetPlatformInfo);
ASSIGN_FROM_DLSYM(clBuildProgram);
ASSIGN_FROM_DLSYM(clEnqueueNDRangeKernel);
ASSIGN_FROM_DLSYM(clSetKernelArg);
ASSIGN_FROM_DLSYM(clReleaseKernel);
ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
ASSIGN_FROM_DLSYM(clCreateBuffer);
ASSIGN_FROM_DLSYM(clCreateImage);
ASSIGN_FROM_DLSYM(clRetainKernel);
ASSIGN_FROM_DLSYM(clCreateKernel);
ASSIGN_FROM_DLSYM(clGetProgramInfo);
ASSIGN_FROM_DLSYM(clFlush);
ASSIGN_FROM_DLSYM(clFinish);
ASSIGN_FROM_DLSYM(clReleaseProgram);
ASSIGN_FROM_DLSYM(clRetainContext);
ASSIGN_FROM_DLSYM(clGetContextInfo);
ASSIGN_FROM_DLSYM(clCreateProgramWithBinary);
ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
ASSIGN_FROM_DLSYM(clEnqueueMapImage);
ASSIGN_FROM_DLSYM(clRetainProgram);
ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
ASSIGN_FROM_DLSYM(clEnqueueWriteBuffer);
ASSIGN_FROM_DLSYM(clWaitForEvents);
ASSIGN_FROM_DLSYM(clReleaseEvent);
ASSIGN_FROM_DLSYM(clCreateContext);
ASSIGN_FROM_DLSYM(clCreateContextFromType);
ASSIGN_FROM_DLSYM(clReleaseContext);
ASSIGN_FROM_DLSYM(clRetainCommandQueue);
ASSIGN_FROM_DLSYM(clEnqueueUnmapMemObject);
ASSIGN_FROM_DLSYM(clRetainMemObject);
ASSIGN_FROM_DLSYM(clReleaseMemObject);
ASSIGN_FROM_DLSYM(clGetDeviceInfo);
ASSIGN_FROM_DLSYM(clGetDeviceIDs);
ASSIGN_FROM_DLSYM(clRetainDevice);
ASSIGN_FROM_DLSYM(clReleaseDevice);
ASSIGN_FROM_DLSYM(clRetainEvent);
ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
ASSIGN_FROM_DLSYM(clGetImageInfo);
#undef ASSIGN_FROM_DLSYM
MACE_CL_ASSIGN_FROM_DLSYM(clGetPlatformIDs);
MACE_CL_ASSIGN_FROM_DLSYM(clGetPlatformInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clBuildProgram);
MACE_CL_ASSIGN_FROM_DLSYM(clEnqueueNDRangeKernel);
MACE_CL_ASSIGN_FROM_DLSYM(clSetKernelArg);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseKernel);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateBuffer);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateImage);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainKernel);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateKernel);
MACE_CL_ASSIGN_FROM_DLSYM(clGetProgramInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clFlush);
MACE_CL_ASSIGN_FROM_DLSYM(clFinish);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseProgram);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainContext);
MACE_CL_ASSIGN_FROM_DLSYM(clGetContextInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateProgramWithBinary);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
MACE_CL_ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
MACE_CL_ASSIGN_FROM_DLSYM(clEnqueueMapImage);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainProgram);
MACE_CL_ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
MACE_CL_ASSIGN_FROM_DLSYM(clEnqueueWriteBuffer);
MACE_CL_ASSIGN_FROM_DLSYM(clWaitForEvents);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseEvent);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateContext);
MACE_CL_ASSIGN_FROM_DLSYM(clCreateContextFromType);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseContext);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainCommandQueue);
MACE_CL_ASSIGN_FROM_DLSYM(clEnqueueUnmapMemObject);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainMemObject);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseMemObject);
MACE_CL_ASSIGN_FROM_DLSYM(clGetDeviceInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clGetDeviceIDs);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainDevice);
MACE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice);
MACE_CL_ASSIGN_FROM_DLSYM(clRetainEvent);
MACE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
MACE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo);
#undef MACE_CL_ASSIGN_FROM_DLSYM
return handle;
}
} // namespace
bool OpenCLLibrary::Supported() { return OpenCLLibraryImpl::Get().loaded(); }
OpenCLLibraryImpl *openclLibraryImpl = nullptr;
} // namespace
void OpenCLLibrary::Load() { OpenCLLibraryImpl::Get().Load(); }
void LoadOpenCLLibrary() {
if (openclLibraryImpl == nullptr) {
openclLibraryImpl = new OpenCLLibraryImpl();
MACE_CHECK(openclLibraryImpl->Load());
}
}
void OpenCLLibrary::Unload() { OpenCLLibraryImpl::Get().Unload(); }
void UnloadOpenCLLibrary() {
openclLibraryImpl->Unload();
delete openclLibraryImpl;
openclLibraryImpl = nullptr;
}
} // namespace mace
cl_int clGetPlatformIDs(cl_uint num_entries,
cl_platform_id *platforms,
cl_uint *num_platforms) {
auto func = mace::OpenCLLibraryImpl::Get().clGetPlatformIDs;
if (func != nullptr) {
return func(num_entries, platforms, num_platforms);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetPlatformIDs;
MACE_CHECK_NOTNULL(func);
return func(num_entries, platforms, num_platforms);
}
cl_int clGetPlatformInfo(cl_platform_id platform,
cl_platform_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetPlatformInfo;
if (func != nullptr) {
return func(platform, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetPlatformInfo;
MACE_CHECK_NOTNULL(func);
return func(platform, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_int clBuildProgram(cl_program program,
......@@ -387,13 +378,11 @@ cl_int clBuildProgram(cl_program program,
void(CL_CALLBACK *pfn_notify)(cl_program program,
void *user_data),
void *user_data) {
auto func = mace::OpenCLLibraryImpl::Get().clBuildProgram;
if (func != nullptr) {
return func(program, num_devices, device_list, options, pfn_notify,
user_data);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clBuildProgram;
MACE_CHECK_NOTNULL(func);
return func(program, num_devices, device_list, options, pfn_notify,
user_data);
}
cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
......@@ -405,44 +394,36 @@ cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueNDRangeKernel;
if (func != nullptr) {
return func(command_queue, kernel, work_dim, global_work_offset,
global_work_size, local_work_size, num_events_in_wait_list,
event_wait_list, event);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clEnqueueNDRangeKernel;
MACE_CHECK_NOTNULL(func);
return func(command_queue, kernel, work_dim, global_work_offset,
global_work_size, local_work_size, num_events_in_wait_list,
event_wait_list, event);
}
cl_int clSetKernelArg(cl_kernel kernel,
cl_uint arg_index,
size_t arg_size,
const void *arg_value) {
auto func = mace::OpenCLLibraryImpl::Get().clSetKernelArg;
if (func != nullptr) {
return func(kernel, arg_index, arg_size, arg_value);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clSetKernelArg;
MACE_CHECK_NOTNULL(func);
return func(kernel, arg_index, arg_size, arg_value);
}
cl_int clRetainMemObject(cl_mem memobj) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainMemObject;
if (func != nullptr) {
return func(memobj);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainMemObject;
MACE_CHECK_NOTNULL(func);
return func(memobj);
}
cl_int clReleaseMemObject(cl_mem memobj) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseMemObject;
if (func != nullptr) {
return func(memobj);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseMemObject;
MACE_CHECK_NOTNULL(func);
return func(memobj);
}
cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
......@@ -451,23 +432,20 @@ cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueUnmapMemObject;
if (func != nullptr) {
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list,
event_wait_list, event);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clEnqueueUnmapMemObject;
MACE_CHECK_NOTNULL(func);
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list,
event_wait_list, event);
}
cl_int clRetainCommandQueue(cl_command_queue command_queue) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainCommandQueue;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainCommandQueue;
MACE_CHECK_NOTNULL(func);
return func(command_queue);
}
cl_context clCreateContext(
const cl_context_properties *properties,
cl_uint num_devices,
......@@ -475,53 +453,44 @@ cl_context clCreateContext(
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateContext;
if (func != nullptr) {
return func(properties, num_devices, devices, pfn_notify, user_data,
errcode_ret);
} else {
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateContext;
MACE_CHECK_NOTNULL(func);
return func(properties, num_devices, devices, pfn_notify, user_data,
errcode_ret);
}
cl_context clCreateContextFromType(
const cl_context_properties *properties,
cl_device_type device_type,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateContextFromType;
if (func != nullptr) {
return func(properties, device_type, pfn_notify, user_data, errcode_ret);
} else {
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateContextFromType;
MACE_CHECK_NOTNULL(func);
return func(properties, device_type, pfn_notify, user_data, errcode_ret);
}
cl_int clReleaseContext(cl_context context) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseContext;
if (func != nullptr) {
return func(context);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseContext;
MACE_CHECK_NOTNULL(func);
return func(context);
}
cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) {
auto func = mace::OpenCLLibraryImpl::Get().clWaitForEvents;
if (func != nullptr) {
return func(num_events, event_list);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clWaitForEvents;
MACE_CHECK_NOTNULL(func);
return func(num_events, event_list);
}
cl_int clReleaseEvent(cl_event event) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseEvent;
if (func != nullptr) {
return func(event);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseEvent;
MACE_CHECK_NOTNULL(func);
return func(event);
}
cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
......@@ -533,13 +502,11 @@ cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueWriteBuffer;
if (func != nullptr) {
return func(command_queue, buffer, blocking_write, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clEnqueueWriteBuffer;
MACE_CHECK_NOTNULL(func);
return func(command_queue, buffer, blocking_write, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event);
}
cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
......@@ -551,13 +518,11 @@ cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueReadBuffer;
if (func != nullptr) {
return func(command_queue, buffer, blocking_read, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clEnqueueReadBuffer;
MACE_CHECK_NOTNULL(func);
return func(command_queue, buffer, blocking_read, offset, size, ptr,
num_events_in_wait_list, event_wait_list, event);
}
cl_int clGetProgramBuildInfo(cl_program program,
......@@ -566,22 +531,18 @@ cl_int clGetProgramBuildInfo(cl_program program,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetProgramBuildInfo;
if (func != nullptr) {
return func(program, device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetProgramBuildInfo;
MACE_CHECK_NOTNULL(func);
return func(program, device, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_int clRetainProgram(cl_program program) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainProgram;
if (func != nullptr) {
return func(program);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainProgram;
MACE_CHECK_NOTNULL(func);
return func(program);
}
void *clEnqueueMapBuffer(cl_command_queue command_queue,
......@@ -594,16 +555,11 @@ void *clEnqueueMapBuffer(cl_command_queue command_queue,
const cl_event *event_wait_list,
cl_event *event,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueMapBuffer;
if (func != nullptr) {
return func(command_queue, buffer, blocking_map, map_flags, offset, size,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clEnqueueMapBuffer;
MACE_CHECK_NOTNULL(func);
return func(command_queue, buffer, blocking_map, map_flags, offset, size,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
}
void *clEnqueueMapImage(cl_command_queue command_queue,
......@@ -618,38 +574,30 @@ void *clEnqueueMapImage(cl_command_queue command_queue,
const cl_event *event_wait_list,
cl_event *event,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueMapImage;
if (func != nullptr) {
return func(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clEnqueueMapImage;
MACE_CHECK_NOTNULL(func);
return func(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch, num_events_in_wait_list,
event_wait_list, event, errcode_ret);
}
cl_command_queue clCreateCommandQueueWithProperties(
cl_context context,
cl_device_id device,
const cl_queue_properties *properties,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateCommandQueueWithProperties;
if (func != nullptr) {
return func(context, device, properties, errcode_ret);
} else {
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateCommandQueueWithProperties;
MACE_CHECK_NOTNULL(func);
return func(context, device, properties, errcode_ret);
}
cl_int clReleaseCommandQueue(cl_command_queue command_queue) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseCommandQueue;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseCommandQueue;
MACE_CHECK_NOTNULL(func);
return func(command_queue);
}
cl_program clCreateProgramWithBinary(cl_context context,
......@@ -659,25 +607,18 @@ cl_program clCreateProgramWithBinary(cl_context context,
const unsigned char **binaries,
cl_int *binary_status,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateProgramWithBinary;
if (func != nullptr) {
return func(context, num_devices, device_list, lengths, binaries,
binary_status, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateProgramWithBinary;
MACE_CHECK_NOTNULL(func);
return func(context, num_devices, device_list, lengths, binaries,
binary_status, errcode_ret);
}
cl_int clRetainContext(cl_context context) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainContext;
if (func != nullptr) {
return func(context);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainContext;
MACE_CHECK_NOTNULL(func);
return func(context);
}
cl_int clGetContextInfo(cl_context context,
......@@ -685,40 +626,32 @@ cl_int clGetContextInfo(cl_context context,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetContextInfo;
if (func != nullptr) {
return func(context, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetContextInfo;
MACE_CHECK_NOTNULL(func);
return func(context, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_int clReleaseProgram(cl_program program) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseProgram;
if (func != nullptr) {
return func(program);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseProgram;
MACE_CHECK_NOTNULL(func);
return func(program);
}
cl_int clFlush(cl_command_queue command_queue) {
auto func = mace::OpenCLLibraryImpl::Get().clFlush;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clFlush;
MACE_CHECK_NOTNULL(func);
return func(command_queue);
}
cl_int clFinish(cl_command_queue command_queue) {
auto func = mace::OpenCLLibraryImpl::Get().clFinish;
if (func != nullptr) {
return func(command_queue);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clFinish;
MACE_CHECK_NOTNULL(func);
return func(command_queue);
}
cl_int clGetProgramInfo(cl_program program,
......@@ -726,36 +659,27 @@ cl_int clGetProgramInfo(cl_program program,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetProgramInfo;
if (func != nullptr) {
return func(program, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetProgramInfo;
MACE_CHECK_NOTNULL(func);
return func(program, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_kernel clCreateKernel(cl_program program,
const char *kernel_name,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateKernel;
if (func != nullptr) {
return func(program, kernel_name, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateKernel;
MACE_CHECK_NOTNULL(func);
return func(program, kernel_name, errcode_ret);
}
cl_int clRetainKernel(cl_kernel kernel) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainKernel;
if (func != nullptr) {
return func(kernel);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainKernel;
MACE_CHECK_NOTNULL(func);
return func(kernel);
}
cl_mem clCreateBuffer(cl_context context,
......@@ -763,15 +687,10 @@ cl_mem clCreateBuffer(cl_context context,
size_t size,
void *host_ptr,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateBuffer;
if (func != nullptr) {
return func(context, flags, size, host_ptr, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateBuffer;
MACE_CHECK_NOTNULL(func);
return func(context, flags, size, host_ptr, errcode_ret);
}
cl_mem clCreateImage(cl_context context,
......@@ -780,16 +699,10 @@ cl_mem clCreateImage(cl_context context,
const cl_image_desc *image_desc,
void *host_ptr,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateImage;
if (func != nullptr) {
return func(context, flags, image_format, image_desc, host_ptr,
errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateImage;
MACE_CHECK_NOTNULL(func);
return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
}
cl_program clCreateProgramWithSource(cl_context context,
......@@ -797,24 +710,17 @@ cl_program clCreateProgramWithSource(cl_context context,
const char **strings,
const size_t *lengths,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clCreateProgramWithSource;
if (func != nullptr) {
return func(context, count, strings, lengths, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clCreateProgramWithSource;
MACE_CHECK_NOTNULL(func);
return func(context, count, strings, lengths, errcode_ret);
}
cl_int clReleaseKernel(cl_kernel kernel) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseKernel;
if (func != nullptr) {
return func(kernel);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseKernel;
MACE_CHECK_NOTNULL(func);
return func(kernel);
}
cl_int clGetDeviceIDs(cl_platform_id platform,
......@@ -822,12 +728,10 @@ cl_int clGetDeviceIDs(cl_platform_id platform,
cl_uint num_entries,
cl_device_id *devices,
cl_uint *num_devices) {
auto func = mace::OpenCLLibraryImpl::Get().clGetDeviceIDs;
if (func != nullptr) {
return func(platform, device_type, num_entries, devices, num_devices);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetDeviceIDs;
MACE_CHECK_NOTNULL(func);
return func(platform, device_type, num_entries, devices, num_devices);
}
cl_int clGetDeviceInfo(cl_device_id device,
......@@ -835,40 +739,32 @@ cl_int clGetDeviceInfo(cl_device_id device,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetDeviceInfo;
if (func != nullptr) {
return func(device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetDeviceInfo;
MACE_CHECK_NOTNULL(func);
return func(device, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_int clRetainDevice(cl_device_id device) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainDevice;
if (func != nullptr) {
return func(device);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainDevice;
MACE_CHECK_NOTNULL(func);
return func(device);
}
cl_int clReleaseDevice(cl_device_id device) {
auto func = mace::OpenCLLibraryImpl::Get().clReleaseDevice;
if (func != nullptr) {
return func(device);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clReleaseDevice;
MACE_CHECK_NOTNULL(func);
return func(device);
}
cl_int clRetainEvent(cl_event event) {
auto func = mace::OpenCLLibraryImpl::Get().clRetainEvent;
if (func != nullptr) {
return func(event);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clRetainEvent;
MACE_CHECK_NOTNULL(func);
return func(event);
}
cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
......@@ -877,13 +773,11 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo;
if (func != nullptr) {
return func(kernel, device, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetKernelWorkGroupInfo;
MACE_CHECK_NOTNULL(func);
return func(kernel, device, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_int clGetEventProfilingInfo(cl_event event,
......@@ -891,13 +785,11 @@ cl_int clGetEventProfilingInfo(cl_event event,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetEventProfilingInfo;
if (func != nullptr) {
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetEventProfilingInfo;
MACE_CHECK_NOTNULL(func);
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
}
cl_int clGetImageInfo(cl_mem image,
......@@ -905,12 +797,9 @@ cl_int clGetImageInfo(cl_mem image,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetImageInfo;
if (func != nullptr) {
return func(image, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
auto func = mace::openclLibraryImpl->clGetImageInfo;
MACE_CHECK_NOTNULL(func);
return func(image, param_name, param_value_size, param_value,
param_value_size_ret);
}
......@@ -7,13 +7,10 @@
namespace mace {
class OpenCLLibrary {
public:
static bool Supported();
static void Load();
static void Unload();
};
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
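Note: in this commit the only intended caller is OpenCLRuntime, whose constructor loads the library and whose destructor unloads it; standalone use must be serialized by the caller, e.g.:

// Hypothetical standalone use; MACE itself drives this from OpenCLRuntime.
mace::LoadOpenCLLibrary();    // dlopen the vendor library, dlsym all cl* symbols
// ... call OpenCL through the stub functions defined in the wrapper ...
mace::UnloadOpenCLLibrary();  // dlclose and reset the global impl pointer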
......@@ -7,7 +7,7 @@
#include "mace/core/allocator.h"
#include "mace/core/common.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/core/types.h"
#include "mace/core/mace.h"
......
......@@ -9,8 +9,8 @@
#include <regex>
#include <vector>
#include "mace/core/logging.h"
#include "mace/core/testing/env_time.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#include "mace/core/testing/test_benchmark.h"
namespace mace {
......@@ -82,7 +82,7 @@ void Benchmark::Run(const char *pattern) {
}
printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
printf("%s\n", string(width + 22, '-').c_str());
printf("%s\n", std::string(width + 22, '-').c_str());
for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue;
for (auto arg : b->args_) {
......@@ -128,7 +128,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
int64_t iters = kMinIters;
while (true) {
accum_time = 0;
start_time = NowMicros();
start_time = utils::NowMicros();
bytes_processed = -1;
items_processed = -1;
label.clear();
......@@ -160,11 +160,11 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
void BytesProcessed(int64_t n) { bytes_processed = n; }
void ItemsProcessed(int64_t n) { items_processed = n; }
void StartTiming() {
if (start_time == 0) start_time = NowMicros();
if (start_time == 0) start_time = utils::NowMicros();
}
void StopTiming() {
if (start_time != 0) {
accum_time += (NowMicros() - start_time);
accum_time += (utils::NowMicros() - start_time);
start_time = 0;
}
}
......
......@@ -3,13 +3,12 @@
//
// Simple benchmarking facility.
#ifndef MACE_TEST_BENCHMARK_H_
#define MACE_TEST_BENCHMARK_H_
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <utility>
#include <vector>
#include "mace/core/types.h"
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
......@@ -31,7 +30,7 @@ class Benchmark {
static void Run(const char *pattern);
private:
string name_;
std::string name_;
int num_args_;
std::vector<std::pair<int, int>> args_;
void (*fn0_)(int) = nullptr;
......@@ -51,4 +50,4 @@ void StopTiming();
} // namespace testing
} // namespace mace
#endif // MACE_TEST_BENCHMARK_H_
#endif // MACE_CORE_TESTING_TEST_BENCHMARK_H_
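// A usage sketch of the facility above, assuming the usual iterations-count
// convention implied by fn0_ (the BM_AddN name is hypothetical):
//
//   static void BM_AddN(int iters) {
//     mace::testing::StartTiming();
//     for (int i = 0; i < iters; ++i) {
//       // ... run the op under test once ...
//     }
//     mace::testing::StopTiming();
//   }
//   BENCHMARK(BM_AddN);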
......@@ -335,4 +335,4 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
return true;
}
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -3,7 +3,8 @@
//
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/utils/env_time.h"
#include "gtest/gtest.h"
using namespace mace;
......@@ -27,17 +28,14 @@ TEST(HexagonControlerWrapper, InputFloat) {
}
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int64_t start_micros = utils::NowMicros();
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
VLOG(0) << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round;
int64_t end_micros = utils::NowMicros();
VLOG(0) << "avg duration: " << (end_micros - start_micros) / (double)round
<< " ms";
wrapper.GetPerfInfo();
wrapper.PrintLog();
......@@ -95,4 +93,4 @@ TEST(HexagonControlerWrapper, PreQuantize) {
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
\ No newline at end of file
}
......@@ -5,7 +5,7 @@
#ifndef MACE_HEXAGON_NN_OPS_H_
#define MACE_HEXAGON_NN_OPS_H_
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include <unordered_map>
namespace mace {
......
......@@ -30,7 +30,7 @@ cc_test(
name = "util_test",
testonly = 1,
srcs = glob(["*_test.cc"]),
copts = ["-std=c++11"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([
"-ldl",
"-lm",
......
......@@ -6,11 +6,12 @@ cc_binary(
srcs = [
"helloworld.cc",
],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
deps = [
"//mace/core",
"//mace/ops",
"//mace/core:opencl_runtime",
],
)
......@@ -18,8 +19,8 @@ cc_test(
name = "benchmark_example",
testonly = 1,
srcs = ["benchmark_example.cc"],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"//mace/core",
......@@ -30,8 +31,8 @@ cc_test(
cc_binary(
name = "mace_run",
srcs = glob(["models/*/*.cc"] + ["mace_run.cc"]),
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1", "-v", "-ftime-report"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"//mace/core",
......
......@@ -3,6 +3,7 @@
//
#include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
using namespace mace;
......@@ -42,16 +43,10 @@ int main() {
net_def.add_op()->CopyFrom(op_def_1);
net_def.add_op()->CopyFrom(op_def_2);
auto input = net_def.add_tensors();
input->set_name("Input");
input->set_data_type(DataType::DT_FLOAT);
input->add_dims(2);
input->add_dims(3);
for (int i = 0; i < 6; ++i) {
input->add_float_data(i - 3);
}
VLOG(0) << net_def.DebugString();
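// The raw byte buffer below stands in for serialized weight data; alignas(4)
// keeps the bytes safely reinterpretable as 32-bit floats (24 bytes = 6
// floats, matching dims {1, 2, 3, 1}).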
alignas(4) unsigned char tensor_data[] = "012345678901234567890123";
const std::vector<int64_t> dims = {1, 2, 3, 1};
TensorProto input("Input", tensor_data, dims, DataType::DT_FLOAT);
net_def.mutable_tensors().push_back(input);
// Create workspace and input tensor
Workspace ws;
......
......@@ -25,8 +25,8 @@ cc_library(
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
"//mace/utils",
"//mace/utils:tuner",
"//mace/core:opencl_runtime",
"//mace/utils:utils_hdrs",
],
)
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_ADDN_H_
#define MACE_KERNELS_ADDN_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -15,7 +16,7 @@ struct AddNFunctorBase {};
template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
Tensor *output_tensor, StatsFuture *future) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size();
......@@ -38,12 +39,14 @@ struct AddNFunctor : AddNFunctorBase {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future);
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
Tensor *output_tensor, StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BATCH_NORM_H_
#define MACE_KERNELS_BATCH_NORM_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -20,7 +21,8 @@ struct BatchNormFunctor {
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
......@@ -80,7 +82,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> {
......@@ -91,7 +94,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> {
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -15,7 +16,8 @@ template <DeviceType D, typename T>
struct BiasAddFunctor {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -51,14 +53,16 @@ template <>
void BiasAddFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
*/
template <typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
......@@ -22,7 +23,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
bool i2b_;
......@@ -34,7 +36,8 @@ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -15,7 +16,8 @@ class ChannelShuffleFunctor {
public:
ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape, T *output) {
void operator()(const T *input, const index_t *input_shape,
T *output, StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -44,4 +46,4 @@ class ChannelShuffleFunctor {
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
\ No newline at end of file
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
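// A worked illustration (hypothetical sizes, one common shuffle convention):
// with channels = 4 and group = 2, channel order [c0, c1, c2, c3] becomes
// [c0, c2, c1, c3], i.e. output channel i reads input channel
// (i % group) * (channels / group) + i / group.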
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_CONCAT_H_
#include "mace/core/common.h"
#include "mace/core/future.h"
#include "mace/core/types.h"
#include "mace/core/mace.h"
#include "mace/core/tensor.h"
......@@ -24,7 +25,8 @@ struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const Tensor *input0 = input_list.front();
const int inputs_count = input_list.size();
......@@ -78,7 +80,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output);
Tensor *output, StatsFuture *future);
};
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
......@@ -32,7 +33,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -130,7 +132,8 @@ template<>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
......@@ -142,7 +145,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_
#define MACE_KERNELS_DEPTHWISE_CONV_H_
#include "mace/core/future.h"
#include "mace/core/common.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/mace.h"
......@@ -23,7 +24,8 @@ struct DepthwiseConv2dFunctor {
void operator()(const Tensor *input, // NCHW
const Tensor *filter, // c_out, c_in, kernel_h, kernel_w
const Tensor *bias, // c_out
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(bias);
......@@ -115,14 +117,16 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -33,8 +33,10 @@ struct FusedConv2dFunctor : FusedConv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
Tensor *output,
StatsFuture *future) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias,
output, future);
T *output_data = output->mutable_data<T>();
T zero_value;
......@@ -62,7 +64,8 @@ struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#define MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -12,7 +13,10 @@ namespace kernels {
template <DeviceType D, typename T>
struct GlobalAvgPoolingFunctor {
void operator()(const T *input, const index_t *input_shape, T *output) {
void operator()(const T *input,
const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -35,9 +39,10 @@ struct GlobalAvgPoolingFunctor {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, float *output);
const float *input, const index_t *input_shape,
float *output, StatsFuture *future);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
\ No newline at end of file
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor,
StatsFuture *future) {
// TODO: neon mem copy
index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>();
......
......@@ -15,7 +15,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\epsilon} } * X +
......
......@@ -44,7 +44,8 @@ template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -79,7 +80,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......
......@@ -29,7 +29,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
......@@ -53,7 +54,7 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......@@ -77,4 +78,4 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, float *output) {
const float *input, const index_t *input_shape,
float *output, StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -52,4 +53,4 @@ void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -56,7 +56,8 @@ extern void PoolingAvgNeonK3x3S2x2Padded(const float *input,
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -122,9 +123,9 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
} else { // not implemented yet
PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
padding_, dilations_)(
input_tensor, output_tensor);
input_tensor, output_tensor, future);
}
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
index_t size = input_tensor->size();
......@@ -66,4 +67,4 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -13,7 +13,7 @@ namespace kernels {
template <typename T>
static void AddN(const std::vector<const Tensor *> &input_tensors,
Tensor *output) {
Tensor *output, StatsFuture *future) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
......@@ -26,7 +26,7 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -61,12 +61,13 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
{1, kwg_size}
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -77,16 +78,25 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
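// A minimal sketch of how a caller consumes the StatsFuture populated above
// (functor/input names are hypothetical; CallStats is declared in
// mace/core/future.h):
//
//   StatsFuture future;
//   functor(input_tensors, output_tensor, &future);  // enqueues, returns
//   CallStats stats;
//   future.wait_fn(&stats);  // blocks on the cl::Event, then fills stats
//   VLOG(1) << "kernel time: "
//           << stats.end_micros - stats.start_micros << " us";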
template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
......@@ -108,7 +118,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
AddN<T>(input_tensors, output_tensor, future);
};
template
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
......@@ -18,8 +19,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -27,7 +28,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -72,12 +73,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -88,10 +90,18 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
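// Conceptually (a sketch, not the actual Tuner implementation): when tuning
// is enabled, TuneOrRun times func() once per candidate local-work-group size
// from params_generator() using the OpenCLProfilingTimer, caches the fastest
// params under the key ss.str(), and thereafter runs func() with the cached
// parameters; otherwise it runs func() once with the default lws.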
template
......
......@@ -15,8 +15,8 @@ template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -28,7 +28,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -43,12 +43,19 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
bias_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bias_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -12,7 +12,8 @@ namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image) {
Tensor *image,
StatsFuture *future) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
......@@ -31,7 +32,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
string kernel_name;
switch (type) {
case FILTER:
......@@ -64,12 +65,20 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {kwg_size, 1, 1};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
......
......@@ -14,7 +14,8 @@ namespace kernels {
static void Concat2(const Tensor *input0,
const Tensor *input1,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -22,7 +23,7 @@ static void Concat2(const Tensor *input0,
const int channel_blk = RoundUpDiv4(channel);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -73,12 +74,13 @@ static void Concat2(const Tensor *input0,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -89,15 +91,24 @@ static void Concat2(const Tensor *input0,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template<typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count == 2 && axis_ == 3)
<< "Concat opencl kernel only support two elements with axis == 3";
......@@ -124,7 +135,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
switch (inputs_count) {
case 2:
Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value, output);
Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future);
break;
default:MACE_NOT_IMPLEMENTED;
}
......
......@@ -11,37 +11,40 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output);
const DataType dt, Tensor *output,
StatsFuture *future);
template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -74,9 +77,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
conv2d_func(input, filter, bias, false, paddings.data(),
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
Conv2dOpencl(input, filter, bias, false, strides_[0],
paddings.data(), DataTypeToEnum<T>::value,
output, future);
}
}
......
......@@ -18,7 +18,8 @@ void Conv1x1(const Tensor *input,
const bool fused_relu,
const int stride,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -45,9 +46,7 @@ void Conv1x1(const Tensor *input,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
uint32_t idx = 0;
......@@ -92,12 +91,13 @@ void Conv1x1(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -108,11 +108,18 @@ void Conv1x1(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
extern void Conv2dOpenclK1x1S1(const Tensor *input,
......@@ -121,8 +128,9 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
Tensor *output,
StatsFuture *future) {
Conv1x1(input, filter, bias, fused_relu, 1, dt, output, future);
};
extern void Conv2dOpenclK1x1S2(const Tensor *input,
......@@ -131,8 +139,9 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
Tensor *output,
StatsFuture *future) {
Conv1x1(input, filter, bias, fused_relu, 2, dt, output, future);
};
} // namespace kernels
......
......@@ -15,7 +15,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const DataType dt, Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -35,9 +36,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
uint32_t idx = 0;
......@@ -84,12 +83,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -100,11 +100,19 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
......@@ -112,8 +120,9 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future);
};
void Conv2dOpenclK3x3S2(const Tensor *input,
......@@ -122,8 +131,9 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future);
};
} // namespace kernels
......
......@@ -15,7 +15,8 @@ namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const DataType dt, Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -35,9 +36,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
uint32_t idx = 0;
......@@ -86,12 +85,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -102,11 +102,19 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} // namespace kernels
......
......@@ -8,17 +8,21 @@ namespace mace {
namespace kernels {
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{nullptr, nullptr},
......@@ -38,7 +42,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......@@ -46,9 +50,9 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
if (paddings_[0] > 0 || paddings_[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
conv2d_func(&padded_input, filter, bias, output, future);
}else {
conv2d_func(input, filter, bias, output);
conv2d_func(input, filter, bias, output, future);
}
}
......
......@@ -14,7 +14,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const uint32_t stride,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
......@@ -30,7 +31,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
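// Illustration (hypothetical sizes): channels = 10 gives channel_blocks =
// (10 + 3) / 4 = 3, since each work item handles a block of 4 channels;
// width is blocked by 4 the same way for pixel_blocks.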
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
......@@ -57,26 +58,36 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future);
};
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future);
};
} // namespace kernels
......
......@@ -11,37 +11,43 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output);
const DataType dt, Tensor *output,
StatsFuture *future);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -73,9 +79,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
conv2d_func(input, filter, bias, true, paddings.data(),
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
DataTypeToEnum<T>::value, output, future);
}
}
......
......@@ -17,7 +17,8 @@ static void Pooling(const Tensor *input,
const int pooling_size,
const PoolingType type,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
......@@ -25,7 +26,7 @@ static void Pooling(const Tensor *input,
index_t channel_blocks = (channels + 3) / 4;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -85,12 +86,13 @@ static void Pooling(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -101,16 +103,27 @@ static void Pooling(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template<typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet";
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
......@@ -128,7 +141,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
output->ResizeImage(output_shape, output_image_shape);
Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
DataTypeToEnum<T>::value, output);
DataTypeToEnum<T>::value, output, future);
}
......
......@@ -14,7 +14,8 @@ namespace kernels {
template <typename T>
void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
......@@ -23,8 +24,7 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
......@@ -74,12 +74,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -90,10 +91,18 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -14,7 +14,7 @@ namespace kernels {
template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, Tensor *output) {
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......@@ -38,7 +38,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -79,12 +79,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
{1, kwg_size / 128, 128},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -95,11 +96,18 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
......
......@@ -17,8 +17,9 @@ template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get();
Tensor *batch_tensor,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
......@@ -42,12 +43,19 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(8),
static_cast<uint32_t>(128)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} // namespace kernels
......
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_POOLING_H
#include <limits>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
......@@ -49,7 +50,8 @@ struct PoolingFunctor : PoolingFunctorBase {
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -153,7 +155,8 @@ struct PoolingFunctor : PoolingFunctorBase {
template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor);
Tensor *output_tensor,
StatsFuture *future);
template<typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
......@@ -166,7 +169,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
Tensor *output_tensor,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_RELU_H_
#define MACE_KERNELS_RELU_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -14,7 +15,7 @@ template <DeviceType D, typename T>
struct ReluFunctor {
T max_limit_;
void operator()(const Tensor *input, Tensor *output) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
index_t size = input->size();
......@@ -32,13 +33,14 @@ struct ReluFunctor {
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <typename T>
struct ReluFunctor<DeviceType::OPENCL, T> {
T max_limit_;
void operator()(const Tensor *input, Tensor *output);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
};
} // namespace kernels
......
......@@ -4,6 +4,7 @@
#ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -122,7 +123,7 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......@@ -167,7 +168,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -18,7 +19,8 @@ struct SpaceToBatchFunctor {
void operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
......@@ -29,7 +31,8 @@ template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output);
Tensor *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -24,8 +24,8 @@ def if_android_arm64(a):
"//conditions:default": [],
})
def if_profiling(a):
def if_profiling_enabled(a):
return select({
"//mace:is_profiling": a,
"//mace:profiling_enabled": a,
"//conditions:default": [],
})
})
......@@ -34,10 +34,7 @@ cc_library(
["*.h"],
exclude = ["ops_test_util.h"],
),
copts = [
"-std=c++11",
"-D_GLIBCXX_USE_C99_MATH_TR1",
],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [
"//mace/kernels",
],
......@@ -50,7 +47,7 @@ cc_test(
srcs = glob(
["*_test.cc"],
),
copts = ["-std=c++11"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
......@@ -64,12 +61,8 @@ cc_test(
name = "ops_benchmark",
testonly = 1,
srcs = glob(["*_benchmark.cc"]),
copts = [
"-std=c++11",
"-fopenmp",
"-D_GLIBCXX_USE_C99_MATH_TR1",
],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":ops",
......
......@@ -16,7 +16,7 @@ class AddNOp : public Operator<D, T> {
AddNOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
Tensor *output_tensor = this->outputs_[0];
int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr);
......@@ -24,7 +24,7 @@ class AddNOp : public Operator<D, T> {
inputs[i] = this->inputs_[i];
}
functor_(inputs, output_tensor);
functor_(inputs, output_tensor, future);
return true;
}
......
......@@ -19,7 +19,7 @@ class BatchNormOp : public Operator<D, T> {
OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *scale = this->Input(SCALE);
const Tensor *offset = this->Input(OFFSET);
......@@ -40,7 +40,7 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, mean, var, output);
functor_(input, scale, offset, mean, var, output, future);
return true;
}
......
......@@ -88,7 +88,7 @@ TEST_F(BatchNormOpTest, SimpleRandomNeon) {
index_t height = 64;
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -129,7 +129,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
index_t height = 103;
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -172,7 +172,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -237,7 +237,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -303,7 +303,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -369,7 +369,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......
......@@ -53,14 +53,14 @@ class BatchToSpaceNDOp: public Operator<D, T> {
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *cropped_tensor = this->Input(CROPS);
Tensor *output = this->Output(OUTPUT);
BatchToSpaceHelper(input_tensor, block_shape_tensor, cropped_tensor, output);
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor));
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor), future);
return true;
}
......
......@@ -16,7 +16,7 @@ class BiasAddOp : public Operator<D, T> {
BiasAddOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), functor_() {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *bias = this->Input(BIAS);
......@@ -28,7 +28,7 @@ class BiasAddOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, bias, output);
functor_(input, bias, output, future);
return true;
}
......
......@@ -68,7 +68,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
index_t width = 64 + rand() % 50;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
......@@ -114,7 +114,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
index_t width = 113 + rand() % 100;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
......
......@@ -16,14 +16,14 @@ class BufferToImageOp: public Operator<D, T> {
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output);
functor_(const_cast<Tensor *>(input_tensor), type, output, future);
return true;
}
......
......@@ -20,7 +20,7 @@ class ChannelShuffleOp : public Operator<D, T> {
group_(OperatorBase::GetSingleArgument<int>("group", 1)),
functor_(this->group_) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->shape()[1] % group_ == 0,
......@@ -29,7 +29,7 @@ class ChannelShuffleOp : public Operator<D, T> {
output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(),
output->mutable_data<T>());
output->mutable_data<T>(), future);
return true;
}
......
......@@ -10,7 +10,7 @@ class ChannelShuffleOpTest : public OpsTestBase {};
TEST_F(ChannelShuffleOpTest, C8G4) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
......
......@@ -17,7 +17,7 @@ class ConcatOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
bool Run() override {
bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
......@@ -30,7 +30,7 @@ class ConcatOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
functor_(input_list, output);
functor_(input_list, output, future);
return true;
}
......
......@@ -12,7 +12,7 @@ class ConcatOpTest : public OpsTestBase {};
TEST_F(ConcatOpTest, CPUSimpleHorizon) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Concat", "ConcatTest")
.Input("Input0")
.Input("Input1")
......@@ -49,7 +49,7 @@ TEST_F(ConcatOpTest, CPUSimpleHorizon) {
TEST_F(ConcatOpTest, CPUSimpleVertical) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Concat", "ConcatTest")
.Input("Input0")
.Input("Input1")
......@@ -92,7 +92,7 @@ TEST_F(ConcatOpTest, CPURandom) {
int num_inputs = 2 + rand() % 10;
int axis = rand() % dim;
// Construct graph
auto &net = test_net();
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(("Input" + ToString(i)).c_str());
......
......@@ -22,13 +22,13 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data()) {
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -78,11 +78,12 @@ void TestSimple3x3SAME() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3VALID<DeviceType::NEON>();
TestSimple3x3SAME<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
......@@ -233,9 +234,11 @@ void TestSimple3x3WithoutBias() {
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONWithoutBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
......@@ -335,9 +338,11 @@ static void TestCombined3x3() {
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
static void TestNHWCCombined3x3() {
......
......@@ -23,7 +23,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
functor_.dilations_ = this->dilations_.data();
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = nullptr;
......@@ -46,7 +46,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -96,27 +96,33 @@ void TestNxNS12(const index_t height, const index_t width) {
}
#ifdef __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
TestNxNS12<DeviceType::NEON>(4, 4);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
TestNxNS12<DeviceType::OPENCL>(4, 4);
}
#ifdef __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
TestNxNS12<DeviceType::NEON>(64, 64);
TestNxNS12<DeviceType::NEON>(128, 128);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(64, 64);
TestNxNS12<DeviceType::OPENCL>(128, 128);
}
#ifdef __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
TestNxNS12<DeviceType::NEON>(107, 113);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(107, 113);
......
......@@ -22,13 +22,13 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data()) {
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -16,7 +16,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
GlobalAvgPoolingOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
......@@ -29,7 +29,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
auto pooling_func = kernels::GlobalAvgPoolingFunctor<D, T>();
pooling_func(input->data<float>(), input->shape().data(),
output->mutable_data<float>());
output->mutable_data<float>(), future);
return true;
}
......
......@@ -10,7 +10,7 @@ class GlobalAvgPoolingOpTest : public OpsTestBase {};
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
......@@ -32,9 +32,10 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#ifdef __ARM_NEON
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
......@@ -55,3 +56,4 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#endif
......@@ -16,13 +16,13 @@ class ImageToBufferOp: public Operator<D, T> {
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor));
functor_(output, type, const_cast<Tensor *>(input_tensor), future);
return true;
}
......
......@@ -176,7 +176,7 @@ class OpsTestNet {
void Sync() {
if (net_ && device_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
OpenCLRuntime::Global()->command_queue().finish();
}
}
......@@ -188,20 +188,14 @@ class OpsTestNet {
};
class OpsTestBase : public ::testing::Test {
public:
OpsTestNet &test_net() { return test_net_; };
protected:
virtual void TearDown() {
auto ws = test_net_.ws();
auto tensor_names = ws->Tensors();
for (auto &name : tensor_names) {
ws->RemoveTensor(name);
}
virtual void SetUp() {
// OpenCLRuntime::CreateGlobal();
}
private:
OpsTestNet test_net_;
virtual void TearDown() {
// OpenCLRuntime::DestroyGlobal();
}
};
template <typename T>
......
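Note: OpsTestBase no longer exposes a shared test_net(); each test now constructs its own OpsTestNet, as the updated tests in this commit do. A sketch of the resulting skeleton, with the builder steps beyond those visible in this diff elided:

TEST_F(PoolingOpTest, Example) {
  // Each test now owns its net and workspace; no state leaks across tests.
  OpsTestNet net;
  OpDefBuilder("Pooling", "PoolingTest")
      .Input("Input")
      .Output("Output")
      // ... remaining attributes, finalize and run as in the tests above ...
      ;
}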
......@@ -23,11 +23,11 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
functor_(pooling_type_, kernels_.data(), this->strides_.data(),
this->padding_, this->dilations_.data()){};
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
functor_(input, output);
functor_(input, output, future);
return true;
};
......
......@@ -15,7 +15,7 @@ class PoolingOpTest : public OpsTestBase {};
TEST_F(PoolingOpTest, MAX_VALID) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -45,7 +45,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -71,7 +71,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -98,7 +98,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -245,7 +245,7 @@ TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......
......@@ -18,12 +18,12 @@ class ReluOp : public Operator<D, T> {
functor_.max_limit_ =
OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->inputs_[0];
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor);
functor_(input_tensor, output_tensor);
functor_(input_tensor, output_tensor, future);
return true;
}
......
......@@ -53,9 +53,11 @@ TEST_F(ReluOpTest, CPUSimple) {
TestSimple<DeviceType::CPU>();
}
#ifdef __ARM_NEON
TEST_F(ReluOpTest, NEONSimple) {
TestSimple<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLSimple) {
TestSimple<DeviceType::OPENCL>();
......@@ -103,9 +105,11 @@ TEST_F(ReluOpTest, CPUUnalignedSimple) {
TestUnalignedSimple<DeviceType::CPU>();
}
#ifdef __ARM_NEON
TEST_F(ReluOpTest, NEONUnalignedSimple) {
TestUnalignedSimple<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
TestUnalignedSimple<DeviceType::OPENCL>();
......@@ -157,9 +161,11 @@ TEST_F(ReluOpTest, CPUSimpleReluX) {
TestSimpleReluX<DeviceType::CPU>();
}
#ifdef __ARM_NEON
TEST_F(ReluOpTest, NEONSimpleReluX) {
TestSimpleReluX<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLSimpleReluX) {
TestSimpleReluX<DeviceType::OPENCL>();
......@@ -209,9 +215,11 @@ TEST_F(ReluOpTest, CPUUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::CPU>();
}
#ifdef __ARM_NEON
TEST_F(ReluOpTest, NEONUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::OPENCL>();
......
......@@ -19,14 +19,14 @@ class ResizeBilinearOp : public Operator<D, T> {
OperatorBase::GetRepeatedArgument<index_t>("size", {-1, -1}),
OperatorBase::GetSingleArgument<bool>("align_corners", false)) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.",
input->dim_size());
functor_(input, output);
functor_(input, output, future);
return true;
}
......
......@@ -73,7 +73,6 @@ static void ResizeBilinearBenchmark(int iters,
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
// SNPE 835 GPU: 6870us
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
......
......@@ -13,7 +13,7 @@ class ResizeBilinearTest : public OpsTestBase {};
TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
......@@ -37,7 +37,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
......
......@@ -52,14 +52,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *paddings_tensor = this->Input(PADDINGS);
Tensor *output = this->Output(OUTPUT);
SpaceToBatchHelper(input_tensor, block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output, future);
return true;
}
......
......@@ -10,16 +10,6 @@ licenses(["notice"]) # Apache 2.0
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
proto_library(
name = "stats",
srcs = ["stats.proto"],
)
cc_proto_library(
name = "stats_proto",
deps = [":stats"],
)
py_proto_library(
name = "mace_py",
srcs = ["mace.proto"],
......
syntax = "proto2";
package mace;
message OperatorStats {
optional string operator_name = 1;
optional string type = 2;
optional int64 all_start_micros = 3;
optional int64 op_start_rel_micros = 4;
optional int64 op_end_rel_micros = 5;
optional int64 all_end_rel_micros = 6;
};
message RunMetadata {
repeated OperatorStats op_stats = 1;
}
......@@ -43,12 +43,3 @@ py_binary(
"//mace/proto:mace_py",
],
)
py_binary(
name = "tf_ops_stats",
srcs = ["tf_ops_stats.py"],
srcs_version = "PY2AND3",
deps = [
"@six_archive//:six",
],
)
......@@ -7,15 +7,10 @@ cc_library(
name = "stat_summarizer",
srcs = ["stat_summarizer.cc"],
hdrs = ["stat_summarizer.h"],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android([
"-ldl",
"-lm",
]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkstatic = 1,
deps = [
"//mace/core",
"//mace/proto:stats_proto",
],
)
......@@ -24,8 +19,7 @@ cc_binary(
srcs = [
"benchmark_model.cc",
],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkstatic = 1,
deps = [
":stat_summarizer",
......
......@@ -4,7 +4,7 @@
#include "mace/tools/benchmark/stat_summarizer.h"
#include "mace/core/common.h"
#include "mace/proto/stats.pb.h"
#include "mace/core/operator.h"
#include <iomanip>
#include <queue>
......@@ -26,20 +26,21 @@ void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
int64_t curr_total_us = 0;
int64_t mem_total = 0;
int64_t first_node_start_us = run_metadata.op_stats(0).all_start_micros();
MACE_CHECK(!run_metadata.op_stats.empty());
int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
int node_num = 0;
for (const auto &ops : run_metadata.op_stats()) {
std::string name = ops.operator_name();
std::string op_type = ops.type();
for (const auto &ops : run_metadata.op_stats) {
std::string name = ops.operator_name;
std::string op_type = ops.type;
++node_num;
const int64_t curr_time = ops.all_end_rel_micros();
const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
curr_total_us += curr_time;
auto result = details_.emplace(name, Detail());
Detail *detail = &(result.first->second);
detail->start_us.UpdateStat(ops.all_start_micros() - first_node_start_us);
detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
detail->rel_end_us.UpdateStat(curr_time);
// If this is the first pass, initialize some values.
......
......@@ -10,36 +10,43 @@ licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android")
cc_library(
name = "command_line_flags",
name = "logging",
srcs = [
"command_line_flags.cc",
"logging.cc",
],
hdrs = [
"command_line_flags.h",
"logging.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
],
linkopts = if_android([
"-llog",
]),
)
cc_library(
name = "utils",
name = "command_line_flags",
srcs = [
"command_line_flags.cc",
],
hdrs = [
"utils.h",
"command_line_flags.h",
],
copts = ["-std=c++11"],
deps = [
":logging",
],
)
cc_library(
name = "tuner",
hdrs = [
"tuner.h",
"timer.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
"//mace/core:opencl_runtime",
":utils_hdrs",
":logging",
],
)
......@@ -50,7 +57,7 @@ cc_test(
"tuner_test.cc",
],
copts = ["-std=c++11"],
linkopts = if_android(["-lm", "-ldl"]),
linkopts = if_android(["-pie", "-lm"]),
linkstatic = 1,
deps = [
":tuner",
......@@ -58,3 +65,22 @@ cc_test(
"@gtest//:gtest_main",
],
)
cc_library(
name = "utils_hdrs",
hdrs = [
"utils.h",
"env_time.h",
],
copts = ["-std=c++11"],
)
cc_library(
name = "utils",
deps = [
":utils_hdrs",
":tuner",
":command_line_flags",
":logging",
],
)
......@@ -3,12 +3,16 @@
//
#include "mace/utils/command_line_flags.h"
#include "mace/utils/logging.h"
#include <cstring>
#include <iomanip>
namespace mace {
namespace {
using namespace std;
bool StringConsume(string &arg, const string &x) {
if ((arg.size() >= x.size()) &&
(memcmp(arg.data(), x.data(), x.size()) == 0)) {
......
......@@ -5,31 +5,32 @@
#ifndef MACE_CORE_COMMAND_LINE_FLAGS_H
#define MACE_CORE_COMMAND_LINE_FLAGS_H
#include "mace/core/common.h"
#include <string>
#include <vector>
namespace mace {
class Flag {
public:
Flag(const char *name, int *dst1, const string &usage_text);
Flag(const char *name, long long *dst1, const string &usage_text);
Flag(const char *name, bool *dst, const string &usage_text);
Flag(const char *name, string *dst, const string &usage_text);
Flag(const char *name, float *dst, const string &usage_text);
Flag(const char *name, int *dst1, const std::string &usage_text);
Flag(const char *name, long long *dst1, const std::string &usage_text);
Flag(const char *name, bool *dst, const std::string &usage_text);
Flag(const char *name, std::string *dst, const std::string &usage_text);
Flag(const char *name, float *dst, const std::string &usage_text);
private:
friend class Flags;
bool Parse(string arg, bool *value_parsing_ok) const;
bool Parse(std::string arg, bool *value_parsing_ok) const;
string name_;
std::string name_;
enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING, TYPE_FLOAT } type_;
int *int_value_;
long long *int64_value_;
bool *bool_value_;
string *string_value_;
std::string *string_value_;
float *float_value_;
string usage_text_;
std::string usage_text_;
};
class Flags {
......@@ -43,7 +44,7 @@ class Flags {
// Return a usage message with command line cmdline, and the
// usage_text strings in flag_list[].
static string Usage(const string &cmdline,
static std::string Usage(const std::string &cmdline,
const std::vector<Flag> &flag_list);
};
......
......@@ -2,19 +2,17 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
// Only support POSIX environment
#ifndef MACE_TESTING_TIME_H_
#define MACE_TESTING_TIME_H_
#ifndef MACE_UTILS_ENV_TIME_H_
#define MACE_UTILS_ENV_TIME_H_
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#include "mace/core/types.h"
namespace mace {
namespace testing {
namespace utils {
inline int64_t NowMicros() {
struct timeval tv;
......@@ -25,4 +23,4 @@ inline int64_t NowMicros() {
} // namespace utils
} // namespace mace
#endif // MACE_TESTING_TIME_H_
#endif // MACE_UTILS_ENV_TIME_H_
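Note: the diff elides the body of NowMicros(); on POSIX it is presumably the usual gettimeofday() conversion, along these lines:

inline int64_t NowMicros() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);  // POSIX wall clock
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}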
......@@ -2,7 +2,7 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include <stdlib.h>
#if defined(PLATFORM_POSIX_ANDROID)
......
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_LOGGING_H_
#define MACE_CORE_LOGGING_H_
#ifndef MACE_UTILS_LOGGING_H_
#define MACE_UTILS_LOGGING_H_
#include <limits>
#include <sstream>
......@@ -150,4 +150,4 @@ T &&CheckNotNull(const char *file, int line, const char *exprtext, T &&t) {
} // namespace internal
} // namespace mace
#endif // MACE_CORE_LOGGING_H_
#endif // MACE_UTILS_LOGGING_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_UTILS_TIMER_H_
#define MACE_UTILS_TIMER_H_
#include "mace/utils/env_time.h"
namespace mace {
class Timer {
public:
virtual void StartTiming() = 0;
virtual void StopTiming() = 0;
virtual double ElapsedMicros() = 0;
};
class WallClockTimer : public Timer {
public:
void StartTiming() override {
start_micros_ = mace::utils::NowMicros();
}
void StopTiming() override {
stop_micros_ = mace::utils::NowMicros();
}
double ElapsedMicros() override {
return stop_micros_ - start_micros_;
}
private:
double start_micros_;
double stop_micros_;
};
} // namespace mace
#endif // MACE_UTILS_TIMER_H_
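Note: a short usage sketch for the new Timer interface; the helper below is illustrative and not part of this commit:

#include <functional>

#include "mace/utils/timer.h"

// Time an arbitrary callable with the wall clock, in microseconds.
inline double TimeMicros(const std::function<void()> &fn) {
  mace::WallClockTimer timer;
  timer.StartTiming();
  fn();
  timer.StopTiming();
  return timer.ElapsedMicros();
}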
......@@ -5,44 +5,50 @@
#ifndef MACE_UTILS_TUNER_H_
#define MACE_UTILS_TUNER_H_
#include <stdlib.h>
#include <cstring>
#include <vector>
#include <fstream>
#include <functional>
#include <limits>
#include <string>
#include <unordered_map>
#include <fstream>
#include <limits>
#include <vector>
#include "mace/core/logging.h"
#include "mace/utils/utils.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/logging.h"
#include "mace/utils/timer.h"
namespace mace {
template<typename param_type>
template <typename param_type>
class Tuner {
public:
static Tuner* Get() {
static Tuner *Get() {
static Tuner tuner;
return &tuner;
}
template <typename RetType>
RetType TuneOrRun(const std::string param_key,
const std::vector<param_type> &default_param,
const std::function<std::vector<std::vector<param_type>>()> &param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func) {
inline bool IsTuning() {
const char *tuning = getenv("MACE_TUNING");
return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
}
template <typename RetType>
RetType TuneOrRun(
const std::string param_key,
const std::vector<param_type> &default_param,
const std::function<std::vector<std::vector<param_type>>()>
&param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func,
Timer *timer) {
if (IsTuning() && param_generator != nullptr) {
// tune
std::vector<param_type> opt_param = default_param;
RetType res = Tune<RetType>(param_generator, func, opt_param);
RetType res = Tune<RetType>(param_generator, func, timer, &opt_param);
param_table_[param_key] = opt_param;
return res;
} else {
// run
if (param_table_.find(param_key) != param_table_.end()) {
VLOG(1) << param_key << ": " << internal::MakeString(param_table_[param_key]);
VLOG(1) << param_key << ": "
<< internal::MakeString(param_table_[param_key]);
return func(param_table_[param_key]);
} else {
return func(default_param);
......@@ -56,17 +62,10 @@ class Tuner {
ReadRunParamters();
}
~Tuner() {
WriteRunParameters();
}
Tuner(const Tuner&) = delete;
Tuner& operator=(const Tuner&) = delete;
~Tuner() { WriteRunParameters(); }
inline bool IsTuning() {
const char *tuning = getenv("MACE_TUNING");
return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
}
Tuner(const Tuner &) = delete;
Tuner &operator=(const Tuner &) = delete;
inline void WriteRunParameters() {
VLOG(1) << path_;
......@@ -83,7 +82,8 @@ class Tuner {
auto &params = kp.second;
int32_t params_size = params.size() * sizeof(param_type);
ofs.write(reinterpret_cast<char*>(&params_size), sizeof(params_size));
ofs.write(reinterpret_cast<char *>(&params_size),
sizeof(params_size));
for (auto &param : params) {
ofs.write(reinterpret_cast<char *>(&param), sizeof(param));
VLOG(1) << param;
......@@ -114,7 +114,7 @@ class Tuner {
params_count = params_size / sizeof(param_type);
std::vector<param_type> params(params_count);
for (int i = 0; i < params_count; ++i) {
ifs.read(reinterpret_cast<char*>(&params[i]), sizeof(param_type));
ifs.read(reinterpret_cast<char *>(&params[i]), sizeof(param_type));
}
param_table_.emplace(key, params);
}
......@@ -126,45 +126,47 @@ class Tuner {
}
template <typename RetType>
inline RetType Run(const std::function<RetType(const std::vector<param_type> &)> &func,
const std::vector<param_type> &params,
int num_runs,
double &time_us) {
inline RetType Run(
const std::function<RetType(const std::vector<param_type> &)> &func,
const std::vector<param_type> &params,
Timer *timer,
int num_runs,
double *time_us) {
RetType res;
double total_time_us = 0.0;
for (int i = 0; i < num_runs; ++i) {
timer->StartTiming();
res = func(params);
OpenCLRuntime::Get()->command_queue().finish();
double start_time = OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000.0;
double end_time = OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000.0;
total_time_us += end_time - start_time;
timer->StopTiming();
total_time_us += timer->ElapsedMicros();
}
time_us = total_time_us * 1.0 / num_runs;
*time_us = total_time_us * 1.0 / num_runs;
return res;
}
template <typename RetType>
inline RetType Tune(const std::function<std::vector<std::vector<param_type>>()> &param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func,
std::vector<param_type> &opt_params) {
OpenCLRuntime::EnableProfiling();
inline RetType Tune(
const std::function<std::vector<std::vector<param_type>>()>
&param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func,
Timer *timer,
std::vector<param_type> *opt_params) {
RetType res;
double opt_time = std::numeric_limits<double>::max();
auto params = param_generator();
for (const auto &param: params) {
for (const auto &param : params) {
double tmp_time = 0.0;
// warm up
Run<RetType>(func, param, 2, tmp_time);
Run<RetType>(func, param, timer, 2, &tmp_time);
// run
RetType tmp_res = Run<RetType>(func, param, 10, tmp_time);
RetType tmp_res = Run<RetType>(func, param, timer, 10, &tmp_time);
// Check the execution time
if (tmp_time < opt_time) {
opt_time = tmp_time;
opt_params = param;
*opt_params = param;
res = tmp_res;
}
}
......@@ -172,9 +174,9 @@ class Tuner {
}
private:
const char* path_;
const char *path_;
std::unordered_map<std::string, std::vector<param_type>> param_table_;
};
} // namespace mace
#endif // MACE_UTILS_TUNER_H_
} // namespace mace
#endif // MACE_UTILS_TUNER_H_
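Note: tuning no longer drives OpenCL profiling internally; callers pass any Timer. A hypothetical end-to-end sketch (kernel body, key, and candidate params are illustrative; MACE_TUNING=1 must be set or the generator is ignored, per IsTuning() above):

#include <cstdlib>
#include <vector>

#include "mace/utils/timer.h"
#include "mace/utils/tuner.h"

int TuneExample() {
  setenv("MACE_TUNING", "1", 1);  // IsTuning() reads exactly this variable
  auto param_generator = []() -> std::vector<std::vector<unsigned int>> {
    return {{1}, {2}, {4}};  // candidate configurations (illustrative)
  };
  auto func = [](const std::vector<unsigned int> &params) -> int {
    // Launch the kernel configured by params here (hypothetical).
    return static_cast<int>(params[0]);
  };
  mace::WallClockTimer timer;
  std::vector<unsigned int> default_params{1};
  return mace::Tuner<unsigned int>::Get()->TuneOrRun<int>(
      "example_kernel", default_params, param_generator, func, &timer);
}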
......@@ -28,13 +28,14 @@ TEST_F(TunerTest, SimpleRun) {
}
};
WallClockTimer timer;
std::vector<int> default_params(1, 1);
int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc);
int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res);
default_params[0] = 2;
res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc);
res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect+1, res);
}
......@@ -54,11 +55,12 @@ TEST_F(TunerTest, SimpleTune) {
return {{1}, {2}, {3}, {4}};
};
// tune
int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, *params_generator, TunerFunc);
WallClockTimer timer;
int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, *params_generator, TunerFunc, &timer);
EXPECT_EQ(expect, res);
// run
res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc);
res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res);
}
......
......@@ -21,10 +21,13 @@ ANDROID_ABI=armeabi-v7a
ANDROID_ABI=arm64-v8a
STRIP=""
STRIP="--strip always"
VLOG_LEVEL=0
PROFILING="--define profiling=true"
# for profiling
bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI --define profiling=true
#bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
BRANCH=$(git symbolic-ref --short HEAD)
COMMIT_ID=$(git rev-parse --short HEAD)
bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET --crosstool_top=//external:android/crosstool --host_crosstool_top=@bazel_tools//tools/cpp:toolchain --cpu=$ANDROID_ABI
if [ $? -ne 0 ]; then
exit 1
......@@ -39,5 +42,5 @@ for device in `adb devices | grep "^[A-Za-z0-9]\+[[:space:]]\+device$"| cut -f1`
adb -s ${device} shell "mkdir -p $DEVICE_PATH"
adb -s ${device} push $CL_PATH $DEVICE_CL_PATH && \
adb -s ${device} push $BAZEL_BIN_PATH/$BIN_NAME $DEVICE_PATH && \
adb -s ${device} shell "MACE_KERNEL_PATH=$DEVICE_CL_PATH $DEVICE_PATH/$BIN_NAME $@"
adb -s ${device} shell "MACE_KERNEL_PATH=$DEVICE_CL_PATH MACE_CPP_MIN_VLOG_LEVEL=0$VLOG_LEVEL $DEVICE_PATH/$BIN_NAME $@"
done
......@@ -53,7 +53,7 @@ bazel build -c opt --strip always mace/examples:mace_run \
adb shell "mkdir -p ${PHONE_DATA_DIR}"
adb shell "mkdir -p ${KERNEL_DIR}"
adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
adb push mace/kernels/opencl/cl/ ${KERNEL_DIR}
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
......