Commit e4860f68 authored by Liangliang He

Refactor OpenCL static dependencies

Parent 2bee11ed
......@@ -25,7 +25,7 @@ config_setting(
)
config_setting(
name = "is_profiling",
name = "profiling_enabled",
define_values = {
"profiling": "true",
},
......
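With the setting renamed to profiling_enabled, OpenCL profiling is toggled from the command line through the define above; a usage sketch (the target label appears later in this diff):

    bazel build --define profiling=true //mace/core:opencl_runtime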
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_profiling")
load("//mace:mace.bzl", "if_android", "if_profiling_enabled")
cc_library(
name = "opencl_runtime",
......@@ -15,76 +15,48 @@ cc_library(
"runtime/opencl/*.cc",
]),
hdrs = glob([
"runtime/opencl/cl.hpp",
"runtime/opencl/cl2.hpp",
"runtime/opencl/*.h",
]),
copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"] +
if_profiling_enabled(["-DMACE_OPENCL_PROFILING"]),
linkopts = ["-ldl"],
deps = [
":logging",
":core",
"//mace/utils:logging",
"//mace/utils:tuner",
"@opencl_headers//:opencl20_headers",
],
alwayslink = 1,
)
cc_library(
name = "logging",
srcs = [
"logging.cc",
],
hdrs = [
"logging.h",
],
copts = ["-std=c++11"],
linkopts = if_android([
"-llog",
]),
)
cc_library(
name = "core",
srcs = glob(
["*.cc",],
exclude=[
"logging.cc",
]),
hdrs = glob(
["*.h"],
exclude=[
"logging.h",
]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"] + if_android([
"-D__USE_OPENCL",
]),
linkopts = ["-ldl"] + if_android([
"-pie",
]),
srcs = glob(["*.cc"]),
hdrs = glob(["*.h"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android(["-pie"]),
deps = [
":logging",
"//mace/proto:stats_proto",
"//mace/utils",
":opencl_runtime",
"//mace/utils:utils_hdrs",
"//mace/utils:logging",
],
)
# Main program for tests
cc_library(
name = "test_benchmark_main",
testonly = 1,
srcs = glob([
"testing/*.cc",
]),
hdrs = glob([
"testing/*.h",
]),
copts = [
"-std=c++11",
"-D_GLIBCXX_USE_C99_MATH_TR1",
hdrs = [
"testing/test_benchmark.h",
],
srcs = [
"testing/test_benchmark.cc",
"testing/test_benchmark_main.cc",
],
linkopts = ["-lm"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
alwayslink = 1,
deps = [
":core",
"//mace/utils:utils_hdrs",
],
alwayslink = 1,
)
......@@ -3,9 +3,6 @@
//
#include "mace/core/allocator.h"
#ifdef __USE_OPENCL
#include "mace/core/opencl_allocator.h"
#endif
namespace mace {
......@@ -25,8 +22,5 @@ Allocator *GetDeviceAllocator(DeviceType type) {
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
#ifdef __USE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
#endif
} // namespace mace
......@@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
using std::set;
using std::map;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_FUTURE_H_
#define MACE_CORE_FUTURE_H_
#include <functional>
#include "mace/utils/logging.h"
namespace mace {
struct CallStats {
int64_t start_micros;
int64_t end_micros;
};
struct OperatorStats {
std::string operator_name;
std::string type;
CallStats stats;
};
struct RunMetadata {
std::vector<OperatorStats> op_stats;
};
// Wait for the call to finish and get the stats if the param is not nullptr
struct StatsFuture {
std::function<void(CallStats *)> wait_fn = [](CallStats *) {
LOG(FATAL) << "wait_fn must be properly set";
};
};
} // namespace mace
#endif // MACE_CORE_FUTURE_H_
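A minimal usage sketch of the future-based API (hypothetical caller code, not part of this commit; op is any OperatorBase implementing the asynchronous Run introduced below): the op fills in wait_fn when it enqueues work, and the caller blocks only when it needs completion or timing stats.

    StatsFuture future;
    op->Run(&future);        // enqueue asynchronously (e.g. on OpenCL)
    CallStats stats;
    future.wait_fn(&stats);  // block until done and collect start/end micros
    int64_t micros = stats.end_micros - stats.start_micros;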
......@@ -7,7 +7,7 @@
#include <cstdint>
#include <vector>
#include <string>
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
namespace mace {
......
......@@ -4,9 +4,6 @@
#include "mace/core/net.h"
#include "mace/utils/utils.h"
#ifdef __USE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
namespace mace {
......@@ -33,65 +30,51 @@ SimpleNet::SimpleNet(const std::shared_ptr<const NetDef> &net_def,
}
}
}
bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Running net " << name_;
for (auto &op : operators_) {
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
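// On OpenCL, ops are enqueued asynchronously; wait on the future only when
// stats are requested or this is the last op, so all queued work is flushed
// before Run() returns.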
bool future_wait = (device_type_ == DeviceType::OPENCL &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
auto &op = *iter;
VLOG(1) << "Running operator " << op->debug_def().name() << "("
<< op->debug_def().type() << ").";
OperatorStats *op_stats = nullptr;
if (run_metadata) {
if (device_type_ != DeviceType::OPENCL) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(NowInMicroSec());
op_stats->set_op_start_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
bool ret;
CallStats call_stats;
if (future_wait) {
StatsFuture future;
ret = op->Run(&future);
if (run_metadata != nullptr) {
future.wait_fn(&call_stats);
} else {
future.wait_fn(nullptr);
}
} else if (run_metadata != nullptr) {
call_stats.start_micros = NowInMicroSec();
ret = op->Run(nullptr);
call_stats.end_micros = NowInMicroSec();
} else {
ret = op->Run(nullptr);
}
if (!op->Run()) {
if (run_metadata != nullptr) {
OperatorStats op_stats = { op->debug_def().name(),
op->debug_def().type(),
call_stats };
run_metadata->op_stats.emplace_back(op_stats);
}
if (!ret) {
LOG(ERROR) << "Operator failed: " << op->debug_def().name();
return false;
}
if (run_metadata) {
if (device_type_ == DeviceType::OPENCL) {
#ifndef __USE_OPENCL
LOG(FATAL) << "OpenCL is not supported";
#else
OpenCLRuntime::Get()->command_queue().finish();
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(
OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000);
op_stats->set_op_start_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_op_end_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
#endif
} else {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
}
}
VLOG(1) << "Op " << op->debug_def().name()
<< " has shape: " << internal::MakeString(op->Output(0)->shape());
}
#ifdef __USE_OPENCL
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
#endif
return true;
}
......
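A caller-side sketch of consuming the new struct-based RunMetadata (net is an assumed SimpleNet instance; the fields come from future.h above):

    RunMetadata metadata;
    if (net->Run(&metadata)) {
      for (const auto &s : metadata.op_stats) {
        VLOG(0) << s.operator_name << "(" << s.type << "): "
                << (s.stats.end_micros - s.stats.start_micros) << " us";
      }
    }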
......@@ -9,7 +9,6 @@
#include "mace/core/operator.h"
#include "mace/core/workspace.h"
#include "mace/core/mace.h"
#include "mace/proto/stats.pb.h"
namespace mace {
......
......@@ -7,6 +7,7 @@
#include "mace/core/common.h"
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
......@@ -55,7 +56,8 @@ class OperatorBase {
inline const vector<const Tensor *> &Inputs() const { return inputs_; }
inline const vector<Tensor *> &Outputs() { return outputs_; }
virtual bool Run() = 0;
// Run the op asynchronously (device-dependent); fill in the future if it is not nullptr.
virtual bool Run(StatsFuture *future) = 0;
inline const OperatorDef &debug_def() const {
MACE_CHECK(has_debug_def(), "operator_def was null!");
......@@ -100,7 +102,7 @@ class Operator : public OperatorBase {
}
}
}
virtual bool Run() override = 0;
virtual bool Run(StatsFuture *future) override = 0;
~Operator() noexcept override {}
};
......
This file diff has been collapsed.
......@@ -5,6 +5,8 @@
#ifndef MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_
#define MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_
// Do not include cl2.hpp directly; include this header instead.
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include "mace/core/runtime/opencl/cl2.hpp"
......
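A usage sketch of the intended include discipline: every translation unit includes the wrapper so that CL_HPP_TARGET_OPENCL_VERSION is defined before cl2.hpp is parsed.

    #include "mace/core/runtime/opencl/cl2_header.h"  // never cl2.hpp directly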
......@@ -3,7 +3,7 @@
//
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
......@@ -37,7 +37,7 @@ OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) {
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Get()->context(),
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
......@@ -53,7 +53,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(),
new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
......@@ -79,7 +79,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
auto queue = OpenCLRuntime::Global()->command_queue();
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
......@@ -101,7 +101,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch.resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Get()->command_queue().enqueueMapImage(*cl_image,
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
......@@ -114,12 +114,13 @@ void *OpenCLAllocator::MapImage(void *buffer,
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
auto queue = OpenCLRuntime::Global()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS);
}
bool OpenCLAllocator::OnHost() { return false; }
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
} // namespace mace
......@@ -7,15 +7,17 @@
#include <memory>
#include <mutex>
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/logging.h"
#include "mace/utils/tuner.h"
#include <CL/opencl.h>
namespace mace {
namespace {
bool ReadFile(const std::string &filename, bool binary,
bool ReadFile(const std::string &filename,
bool binary,
std::vector<unsigned char> *content_ptr) {
MACE_CHECK_NOTNULL(content_ptr);
......@@ -55,7 +57,8 @@ bool ReadFile(const std::string &filename, bool binary,
return true;
}
bool WriteFile(const std::string &filename, bool binary,
bool WriteFile(const std::string &filename,
bool binary,
const std::vector<unsigned char> &content) {
std::ios_base::openmode mode = std::ios::out;
if (binary) {
......@@ -76,124 +79,92 @@ bool WriteFile(const std::string &filename, bool binary,
} // namespace
bool OpenCLRuntime::enable_profiling_ = false;
std::unique_ptr<cl::Event> OpenCLRuntime::profiling_ev_ = nullptr;
void OpenCLProfilingTimer::StartTiming() {}
OpenCLRuntime *OpenCLRuntime::Get() {
static std::once_flag init_once;
static OpenCLRuntime *instance = nullptr;
std::call_once(init_once, []() {
if (!mace::OpenCLLibrary::Supported()) {
LOG(ERROR) << "OpenCL not supported";
return;
}
void OpenCLProfilingTimer::StopTiming() {
OpenCLRuntime::Global()->command_queue().finish();
start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
}
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(ERROR) << "No OpenCL platforms found";
return;
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: "
<< default_platform.getInfo<CL_PLATFORM_NAME>() << ", "
<< default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(ERROR) << "No OpenCL devices found";
return;
}
double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0;
}
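A usage sketch of the new profiling timer (kernel, global_range, and local_range are assumed); it is only meaningful when the queue was created with CL_QUEUE_PROFILING_ENABLE, i.e. built with -DMACE_OPENCL_PROFILING:

    cl::Event event;
    runtime->command_queue().enqueueNDRangeKernel(
        kernel, cl::NullRange, global_range, local_range, nullptr, &event);
    OpenCLProfilingTimer timer(&event);
    timer.StopTiming();                      // finish() and read the counters
    double device_micros = timer.ElapsedMicros();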
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(ERROR) << "No GPU device found";
return;
}
OpenCLRuntime *OpenCLRuntime::Global() {
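// C++11 guarantees thread-safe, exactly-once initialization of
// function-local statics, which replaces the former std::call_once-based
// lazy initialization.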
static OpenCLRuntime instance;
return &instance;
}
cl_command_queue_properties properties = 0;
#ifdef __ENABLE_PROFILING
enable_profiling_ = true;
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
#endif
OpenCLRuntime::OpenCLRuntime() {
LoadOpenCLLibrary();
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device, properties);
instance = new OpenCLRuntime(context, gpu_device, command_queue);
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(FATAL) << "No OpenCL platforms found";
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>()
<< ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(FATAL) << "No OpenCL devices found";
}
});
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(FATAL) << "No GPU device found";
}
return instance;
}
cl_command_queue_properties properties = 0;
void OpenCLRuntime::EnableProfiling() { enable_profiling_ = true; }
#ifdef MACE_OPENCL_PROFILING
properties |= CL_QUEUE_PROFILING_ENABLE;
#endif
cl::Event *OpenCLRuntime::GetDefaultEvent() { return profiling_ev_.get(); }
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device, properties);
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
}
const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context);
this->command_queue_ = new cl::CommandQueue(command_queue);
}
OpenCLRuntime::OpenCLRuntime(cl::Context context, cl::Device device,
cl::CommandQueue command_queue)
: context_(context), device_(device), command_queue_(command_queue) {
const char *kernel_path = getenv("MACE_KERNEL_PATH");
kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
OpenCLRuntime::~OpenCLRuntime() {
built_program_map_.clear();
delete command_queue_;
delete context_;
delete device_;
UnloadOpenCLLibrary();
}
OpenCLRuntime::~OpenCLRuntime() {}
cl::Context &OpenCLRuntime::context() { return context_; }
cl::Context &OpenCLRuntime::context() { return *context_; }
cl::Device &OpenCLRuntime::device() { return device_; }
cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
cl::Program &OpenCLRuntime::program() {
// TODO(liuqi): unused; kept only for legacy code.
return program_;
}
// TODO(heliangliang) Support binary format
const std::map<std::string, std::string> OpenCLRuntime::program_map_ = {
{"addn", "addn.cl"},
{"batch_norm", "batch_norm.cl"},
{"bias_add", "bias_add.cl"},
{"buffer_to_image", "buffer_to_image.cl"},
{"conv_2d", "conv_2d.cl"},
{"conv_2d_1x1", "conv_2d_1x1.cl"},
{"conv_2d_3x3", "conv_2d_3x3.cl"},
{"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
{"pooling", "pooling.cl"},
{"relu", "relu.cl"},
{"concat", "concat.cl"},
{"resize_bilinear", "resize_bilinear.cl"},
{"space_to_batch", "space_to_batch.cl"},
};
std::string
OpenCLRuntime::GenerateCLBinaryFilenamePrefix(const std::string &filename_msg) {
std::string OpenCLRuntime::GenerateCLBinaryFilenamePrefix(
const std::string &filename_msg) {
std::string filename_prefix = filename_msg;
for (auto it = filename_prefix.begin(); it != filename_prefix.end(); ++it) {
if (*it == ' ' || *it == '-' || *it == '=') {
......@@ -262,7 +233,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
program_binary_sizes.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[ device_list_size ]);
new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) {
program_binaries[i] = std::unique_ptr<unsigned char[]>(
new unsigned char[program_binary_sizes[i]]);
......@@ -281,16 +252,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
}
}
cl::Kernel
OpenCLRuntime::BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
auto kernel_program_it = program_map_.find(program_name);
if (kernel_program_it == program_map_.end()) {
MACE_CHECK(false, program_name, " opencl kernel doesn't exist.");
}
std::string program_file_name = kernel_program_it->second;
cl::Kernel OpenCLRuntime::BuildKernel(
const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
std::string program_file_name = program_name + ".cl";
std::string build_options_str;
for (auto &option : build_options) {
build_options_str += " " + option;
......@@ -312,15 +278,24 @@ OpenCLRuntime::BuildKernel(const std::string &program_name,
return cl::Kernel(program, kernel_name.c_str());
}
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) {
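// cl::Event profiling counters are reported in nanoseconds; convert to
// microseconds.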
stats->start_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
stats->end_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
}
}
uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
unsigned long long size = 0;
device_.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
return static_cast<uint32_t>(size);
}
uint32_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
unsigned long long size = 0;
kernel.getWorkGroupInfo(device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
return static_cast<uint32_t>(size);
}
......
......@@ -10,36 +10,42 @@
#include <mutex>
#include <set>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
#include "mace/utils/timer.h"
namespace mace {
class OpenCLRuntime {
public:
static OpenCLRuntime *Get();
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {}
void StartTiming() override;
void StopTiming() override;
double ElapsedMicros() override;
static void EnableProfiling();
cl::Event *GetDefaultEvent();
cl_ulong GetEventProfilingStartInfo();
cl_ulong GetEventProfilingEndInfo();
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
};
class OpenCLRuntime {
public:
static OpenCLRuntime *Global();
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
cl::Program &program();
void GetCallStats(const cl::Event &event, CallStats *stats);
uint32_t GetDeviceMaxWorkGroupSize();
uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
private:
OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue);
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime&) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
......@@ -51,19 +57,14 @@ class OpenCLRuntime {
std::string GenerateCLBinaryFilenamePrefix(const std::string &filename_msg);
private:
static bool enable_profiling_;
static std::unique_ptr<cl::Event> profiling_ev_;
cl::Context context_;
cl::Device device_;
cl::CommandQueue command_queue_;
cl::Program program_;
// All OpenCL objects must be pointers and manually deleted before unloading
// the OpenCL library.
cl::Context *context_;
cl::Device *device_;
cl::CommandQueue *command_queue_;
std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_;
std::string kernel_path_;
static const std::map<std::string,
std::string> program_map_;
mutable std::map<std::string,
cl::Program> built_program_map_;
};
} // namespace mace
......
......@@ -7,13 +7,10 @@
namespace mace {
class OpenCLLibrary {
public:
static bool Supported();
static void Load();
static void Unload();
};
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
......@@ -7,7 +7,7 @@
#include "mace/core/allocator.h"
#include "mace/core/common.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/core/types.h"
#include "mace/core/mace.h"
......
......@@ -9,8 +9,8 @@
#include <regex>
#include <vector>
#include "mace/core/logging.h"
#include "mace/core/testing/env_time.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#include "mace/core/testing/test_benchmark.h"
namespace mace {
......@@ -82,7 +82,7 @@ void Benchmark::Run(const char *pattern) {
}
printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
printf("%s\n", string(width + 22, '-').c_str());
printf("%s\n", std::string(width + 22, '-').c_str());
for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue;
for (auto arg : b->args_) {
......@@ -128,7 +128,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
int64_t iters = kMinIters;
while (true) {
accum_time = 0;
start_time = NowMicros();
start_time = utils::NowMicros();
bytes_processed = -1;
items_processed = -1;
label.clear();
......@@ -160,11 +160,11 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
void BytesProcessed(int64_t n) { bytes_processed = n; }
void ItemsProcessed(int64_t n) { items_processed = n; }
void StartTiming() {
if (start_time == 0) start_time = NowMicros();
if (start_time == 0) start_time = utils::NowMicros();
}
void StopTiming() {
if (start_time != 0) {
accum_time += (NowMicros() - start_time);
accum_time += (utils::NowMicros() - start_time);
start_time = 0;
}
}
......
......@@ -3,13 +3,12 @@
//
// Simple benchmarking facility.
#ifndef MACE_TEST_BENCHMARK_H_
#define MACE_TEST_BENCHMARK_H_
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <utility>
#include <vector>
#include "mace/core/types.h"
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
......@@ -31,7 +30,7 @@ class Benchmark {
static void Run(const char *pattern);
private:
string name_;
std::string name_;
int num_args_;
std::vector<std::pair<int, int>> args_;
void (*fn0_)(int) = nullptr;
......@@ -51,4 +50,4 @@ void StopTiming();
} // namespace testing
} // namespace mace
#endif // MACE_TEST_BENCHMARK_H_
#endif // MACE_CORE_TESTING_TEST_BENCHMARK_H_
......@@ -335,4 +335,4 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
return true;
}
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -3,7 +3,8 @@
//
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/utils/env_time.h"
#include "gtest/gtest.h"
using namespace mace;
......@@ -27,17 +28,14 @@ TEST(HexagonControlerWrapper, InputFloat) {
}
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int64_t start_micros = utils::NowMicros();
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
VLOG(0) << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round;
int64_t end_micros = utils::NowMicros();
VLOG(0) << "avg duration: " << (end_micros - start_micros) / (double)round
<< " ms";
wrapper.GetPerfInfo();
wrapper.PrintLog();
......@@ -95,4 +93,4 @@ TEST(HexagonControlerWrapper, PreQuantize) {
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
\ No newline at end of file
}
......@@ -5,7 +5,7 @@
#ifndef MACE_HEXAGON_NN_OPS_H_
#define MACE_HEXAGON_NN_OPS_H_
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include <unordered_map>
namespace mace {
......
......@@ -30,7 +30,7 @@ cc_test(
name = "util_test",
testonly = 1,
srcs = glob(["*_test.cc"]),
copts = ["-std=c++11"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([
"-ldl",
"-lm",
......
......@@ -6,11 +6,12 @@ cc_binary(
srcs = [
"helloworld.cc",
],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
deps = [
"//mace/core",
"//mace/ops",
"//mace/core:opencl_runtime",
],
)
......@@ -18,8 +19,8 @@ cc_test(
name = "benchmark_example",
testonly = 1,
srcs = ["benchmark_example.cc"],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"//mace/core",
......@@ -30,8 +31,8 @@ cc_test(
cc_binary(
name = "mace_run",
srcs = glob(["models/*/*.cc"] + ["mace_run.cc"]),
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1", "-v", "-ftime-report"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"//mace/core",
......
......@@ -3,6 +3,7 @@
//
#include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
using namespace mace;
......@@ -42,16 +43,10 @@ int main() {
net_def.add_op()->CopyFrom(op_def_1);
net_def.add_op()->CopyFrom(op_def_2);
auto input = net_def.add_tensors();
input->set_name("Input");
input->set_data_type(DataType::DT_FLOAT);
input->add_dims(2);
input->add_dims(3);
for (int i = 0; i < 6; ++i) {
input->add_float_data(i - 3);
}
VLOG(0) << net_def.DebugString();
alignas(4) unsigned char tensor_data[] = "012345678901234567890123";
const std::vector<int64_t> dims = {1, 2, 3, 1};
TensorProto input("Input", tensor_data, dims, DataType::DT_FLOAT);
net_def.mutable_tensors().push_back(input);
// Create workspace and input tensor
Workspace ws;
......
......@@ -25,8 +25,8 @@ cc_library(
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
"//mace/utils",
"//mace/utils:tuner",
"//mace/core:opencl_runtime",
"//mace/utils:utils_hdrs",
],
)
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_ADDN_H_
#define MACE_KERNELS_ADDN_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -15,7 +16,7 @@ struct AddNFunctorBase {};
template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
Tensor *output_tensor, StatsFuture *future) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size();
......@@ -38,12 +39,14 @@ struct AddNFunctor : AddNFunctorBase {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future);
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
Tensor *output_tensor, StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BATCH_NORM_H_
#define MACE_KERNELS_BATCH_NORM_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -20,7 +21,8 @@ struct BatchNormFunctor {
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
......@@ -80,7 +82,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> {
......@@ -91,7 +94,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> {
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -15,7 +16,8 @@ template <DeviceType D, typename T>
struct BiasAddFunctor {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -51,14 +53,16 @@ template <>
void BiasAddFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
*/
template <typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
......@@ -22,7 +23,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
bool i2b_;
......@@ -34,7 +36,8 @@ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -15,7 +16,8 @@ class ChannelShuffleFunctor {
public:
ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape, T *output) {
void operator()(const T *input, const index_t *input_shape,
T *output, StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -44,4 +46,4 @@ class ChannelShuffleFunctor {
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
\ No newline at end of file
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_CONCAT_H_
#include "mace/core/common.h"
#include "mace/core/future.h"
#include "mace/core/types.h"
#include "mace/core/mace.h"
#include "mace/core/tensor.h"
......@@ -24,7 +25,8 @@ struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const Tensor *input0 = input_list.front();
const int inputs_count = input_list.size();
......@@ -78,7 +80,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output);
Tensor *output, StatsFuture *future);
};
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
......@@ -32,7 +33,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -130,7 +132,8 @@ template<>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
......@@ -142,7 +145,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_
#define MACE_KERNELS_DEPTHWISE_CONV_H_
#include "mace/core/future.h"
#include "mace/core/common.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/mace.h"
......@@ -23,7 +24,8 @@ struct DepthwiseConv2dFunctor {
void operator()(const Tensor *input, // NCHW
const Tensor *filter, // c_out, c_in, kernel_h, kernel_w
const Tensor *bias, // c_out
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(bias);
......@@ -115,14 +117,16 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -33,8 +33,10 @@ struct FusedConv2dFunctor : FusedConv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
Tensor *output,
StatsFuture *future) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias,
output, future);
T *output_data = output->mutable_data<T>();
T zero_value;
......@@ -62,7 +64,8 @@ struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#define MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -12,7 +13,10 @@ namespace kernels {
template <DeviceType D, typename T>
struct GlobalAvgPoolingFunctor {
void operator()(const T *input, const index_t *input_shape, T *output) {
void operator()(const T *input,
const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -35,9 +39,10 @@ struct GlobalAvgPoolingFunctor {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, float *output);
const float *input, const index_t *input_shape,
float *output, StatsFuture *future);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
\ No newline at end of file
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor,
StatsFuture *future) {
// TODO: neon mem copy
index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>();
......
......@@ -15,7 +15,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\epsilon} } * X +
......
......@@ -44,7 +44,8 @@ template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -79,7 +80,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......
......@@ -29,7 +29,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
......@@ -53,7 +54,7 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......@@ -77,4 +78,4 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, float *output) {
const float *input, const index_t *input_shape,
float *output, StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -52,4 +53,4 @@ void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -56,7 +56,8 @@ extern void PoolingAvgNeonK3x3S2x2Padded(const float *input,
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -122,9 +123,9 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
} else { // not implement yet
PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
padding_, dilations_)(
input_tensor, output_tensor);
input_tensor, output_tensor, future);
}
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
index_t size = input_tensor->size();
......@@ -66,4 +67,4 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -13,7 +13,7 @@ namespace kernels {
template <typename T>
static void AddN(const std::vector<const Tensor *> &input_tensors,
Tensor *output) {
Tensor *output, StatsFuture *future) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
......@@ -26,7 +26,7 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -61,12 +61,13 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
{1, kwg_size}
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -77,16 +78,25 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
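// Defer synchronization to the caller: Run() returns right after enqueueing,
// and the future waits on the event (optionally collecting profiling stats).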
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
......@@ -108,7 +118,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
AddN<T>(input_tensors, output_tensor, future);
};
template
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
......@@ -18,8 +19,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -27,7 +28,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -72,12 +73,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -88,10 +90,18 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -15,8 +15,8 @@ template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -28,7 +28,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -43,12 +43,19 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
bias_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bias_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -12,7 +12,8 @@ namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image) {
Tensor *image,
StatsFuture *future) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
......@@ -31,7 +32,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
string kernel_name;
switch (type) {
case FILTER:
......@@ -64,12 +65,20 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {kwg_size, 1, 1};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
......
......@@ -14,7 +14,8 @@ namespace kernels {
static void Concat2(const Tensor *input0,
const Tensor *input1,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -22,7 +23,7 @@ static void Concat2(const Tensor *input0,
const int channel_blk = RoundUpDiv4(channel);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -73,12 +74,13 @@ static void Concat2(const Tensor *input0,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -89,15 +91,24 @@ static void Concat2(const Tensor *input0,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template<typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count == 2 && axis_ == 3)
<< "Concat opencl kernel only support two elements with axis == 3";
......@@ -124,7 +135,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
switch (inputs_count) {
case 2:
Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value, output);
Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future);
break;
default:MACE_NOT_IMPLEMENTED;
}
......
......@@ -11,37 +11,40 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output);
const DataType dt, Tensor *output,
StatsFuture *future);
template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -74,9 +77,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
conv2d_func(input, filter, bias, false, paddings.data(),
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
Conv2dOpencl(input, filter, bias, false, strides_[0],
paddings.data(), DataTypeToEnum<T>::value,
output, future);
}
}
......
......@@ -18,7 +18,8 @@ void Conv1x1(const Tensor *input,
const bool fused_relu,
const int stride,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -45,9 +46,7 @@ void Conv1x1(const Tensor *input,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
uint32_t idx = 0;
......@@ -92,12 +91,13 @@ void Conv1x1(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -108,11 +108,18 @@ void Conv1x1(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
extern void Conv2dOpenclK1x1S1(const Tensor *input,
......@@ -121,8 +128,9 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
Tensor *output,
StatsFuture *future) {
Conv1x1(input, filter, bias, fused_relu, 1, dt, output, future);
};
extern void Conv2dOpenclK1x1S2(const Tensor *input,
......@@ -131,8 +139,9 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
Tensor *output,
StatsFuture *future) {
Conv1x1(input, filter, bias, fused_relu, 2, dt, output, future);
};
} // namespace kernels
......
......@@ -15,7 +15,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const DataType dt, Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -35,9 +36,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
uint32_t idx = 0;
......@@ -84,12 +83,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -100,11 +100,19 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
......@@ -112,8 +120,9 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future);
};
void Conv2dOpenclK3x3S2(const Tensor *input,
......@@ -122,8 +131,9 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future);
};
} // namespace kernels
......
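GetCallStats itself is not part of this diff; a plausible implementation, assuming the command queue was created with profiling enabled, reads the device timestamps off the event:

  void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
    // CL_PROFILING_COMMAND_START/END are nanosecond device timestamps;
    // CallStats stores microseconds.
    stats->start_micros =
        event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
    stats->end_micros =
        event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
  }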
......@@ -15,7 +15,8 @@ namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const DataType dt, Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -35,9 +36,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
uint32_t idx = 0;
......@@ -86,12 +85,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -102,11 +102,19 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} // namespace kernels
......
......@@ -8,17 +8,21 @@ namespace mace {
namespace kernels {
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{nullptr, nullptr},
......@@ -38,7 +42,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......@@ -46,9 +50,9 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
if (paddings_[0] > 0 || paddings_[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
conv2d_func(&padded_input, filter, bias, output, future);
    } else {
conv2d_func(input, filter, bias, output);
conv2d_func(input, filter, bias, output, future);
}
}
......
......@@ -14,7 +14,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const uint32_t stride,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
......@@ -30,7 +31,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
......@@ -57,26 +58,36 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future);
};
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future);
};
} // namespace kernels
......
......@@ -11,37 +11,43 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output);
const DataType dt, Tensor *output,
StatsFuture *future);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -73,9 +79,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
conv2d_func(input, filter, bias, true, paddings.data(),
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
DataTypeToEnum<T>::value, output, future);
}
}
......
......@@ -17,7 +17,8 @@ static void Pooling(const Tensor *input,
const int pooling_size,
const PoolingType type,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
......@@ -25,7 +26,7 @@ static void Pooling(const Tensor *input,
index_t channel_blocks = (channels + 3) / 4;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -85,12 +86,13 @@ static void Pooling(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -101,16 +103,27 @@ static void Pooling(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template<typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet";
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
......@@ -128,7 +141,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
output->ResizeImage(output_shape, output_image_shape);
Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
DataTypeToEnum<T>::value, output);
DataTypeToEnum<T>::value, output, future);
}
......
......@@ -14,7 +14,8 @@ namespace kernels {
template <typename T>
void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
......@@ -23,8 +24,7 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
......@@ -74,12 +74,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -90,10 +91,18 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -14,7 +14,7 @@ namespace kernels {
template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, Tensor *output) {
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......@@ -38,7 +38,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -79,12 +79,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
{1, kwg_size / 128, 128},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -95,11 +96,18 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
......
......@@ -17,8 +17,9 @@ template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get();
Tensor *batch_tensor,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
......@@ -42,12 +43,19 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(8),
static_cast<uint32_t>(128)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} // namespace kernels
......
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_POOLING_H
#include <limits>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
......@@ -49,7 +50,8 @@ struct PoolingFunctor : PoolingFunctorBase {
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -153,7 +155,8 @@ struct PoolingFunctor : PoolingFunctorBase {
template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor);
Tensor *output_tensor,
StatsFuture *future);
template<typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
......@@ -166,7 +169,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
Tensor *output_tensor,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_RELU_H_
#define MACE_KERNELS_RELU_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -14,7 +15,7 @@ template <DeviceType D, typename T>
struct ReluFunctor {
T max_limit_;
void operator()(const Tensor *input, Tensor *output) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
index_t size = input->size();
......@@ -32,13 +33,14 @@ struct ReluFunctor {
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <typename T>
struct ReluFunctor<DeviceType::OPENCL, T> {
T max_limit_;
void operator()(const Tensor *input, Tensor *output);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
};
} // namespace kernels
......
......@@ -4,6 +4,7 @@
#ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -122,7 +123,7 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......@@ -167,7 +168,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -18,7 +19,8 @@ struct SpaceToBatchFunctor {
void operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
......@@ -29,7 +31,8 @@ template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output);
Tensor *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -24,8 +24,8 @@ def if_android_arm64(a):
"//conditions:default": [],
})
def if_profiling(a):
def if_profiling_enabled(a):
return select({
"//mace:is_profiling": a,
"//mace:profiling_enabled": a,
"//conditions:default": [],
})
})
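How the MACE_OPENCL_PROFILING define (wired up through if_profiling_enabled above) gets consumed is not shown in this diff; a typical, assumed use is to create the command queue with profiling enabled so the event timestamps read by GetCallStats are valid:

  cl_command_queue_properties properties = 0;
  #ifdef MACE_OPENCL_PROFILING
  properties |= CL_QUEUE_PROFILING_ENABLE;  // required for CL_PROFILING_COMMAND_* queries
  #endif
  cl::CommandQueue command_queue(context, device, properties);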
......@@ -34,10 +34,7 @@ cc_library(
["*.h"],
exclude = ["ops_test_util.h"],
),
copts = [
"-std=c++11",
"-D_GLIBCXX_USE_C99_MATH_TR1",
],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [
"//mace/kernels",
],
......@@ -50,7 +47,7 @@ cc_test(
srcs = glob(
["*_test.cc"],
),
copts = ["-std=c++11"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
......@@ -64,12 +61,8 @@ cc_test(
name = "ops_benchmark",
testonly = 1,
srcs = glob(["*_benchmark.cc"]),
copts = [
"-std=c++11",
"-fopenmp",
"-D_GLIBCXX_USE_C99_MATH_TR1",
],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":ops",
......
......@@ -16,7 +16,7 @@ class AddNOp : public Operator<D, T> {
AddNOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
Tensor *output_tensor = this->outputs_[0];
int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr);
......@@ -24,7 +24,7 @@ class AddNOp : public Operator<D, T> {
inputs[i] = this->inputs_[i];
}
functor_(inputs, output_tensor);
functor_(inputs, output_tensor, future);
return true;
}
......
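With the new Run signature, a runner drives an operator roughly as sketched here (net scheduling details are not part of this diff):

  StatsFuture future;
  op->Run(&future);         // OpenCL ops enqueue and return before the kernel finishes
  future.wait_fn(nullptr);  // synchronize only; pass a CallStats* to also collect timing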
......@@ -19,7 +19,7 @@ class BatchNormOp : public Operator<D, T> {
OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *scale = this->Input(SCALE);
const Tensor *offset = this->Input(OFFSET);
......@@ -40,7 +40,7 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, mean, var, output);
functor_(input, scale, offset, mean, var, output, future);
return true;
}
......
......@@ -88,7 +88,7 @@ TEST_F(BatchNormOpTest, SimpleRandomNeon) {
index_t height = 64;
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -129,7 +129,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
index_t height = 103;
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -172,7 +172,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -237,7 +237,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -303,7 +303,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -369,7 +369,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......
......@@ -53,14 +53,14 @@ class BatchToSpaceNDOp: public Operator<D, T> {
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *cropped_tensor = this->Input(CROPS);
Tensor *output = this->Output(OUTPUT);
BatchToSpaceHelper(input_tensor, block_shape_tensor, cropped_tensor, output);
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor));
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor), future);
return true;
}
......
......@@ -16,7 +16,7 @@ class BiasAddOp : public Operator<D, T> {
BiasAddOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), functor_() {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *bias = this->Input(BIAS);
......@@ -28,7 +28,7 @@ class BiasAddOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, bias, output);
functor_(input, bias, output, future);
return true;
}
......
......@@ -68,7 +68,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
index_t width = 64 + rand() % 50;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
......@@ -114,7 +114,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
index_t width = 113 + rand() % 100;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
......
......@@ -16,14 +16,14 @@ class BufferToImageOp: public Operator<D, T> {
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output);
functor_(const_cast<Tensor *>(input_tensor), type, output, future);
return true;
}
......
......@@ -20,7 +20,7 @@ class ChannelShuffleOp : public Operator<D, T> {
group_(OperatorBase::GetSingleArgument<int>("group", 1)),
functor_(this->group_) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->shape()[1] % group_ == 0,
......@@ -29,7 +29,7 @@ class ChannelShuffleOp : public Operator<D, T> {
output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(),
output->mutable_data<T>());
output->mutable_data<T>(), future);
return true;
}
......
......@@ -10,7 +10,7 @@ class ChannelShuffleOpTest : public OpsTestBase {};
TEST_F(ChannelShuffleOpTest, C8G4) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
......
......@@ -17,7 +17,7 @@ class ConcatOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
bool Run() override {
bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
......@@ -30,7 +30,7 @@ class ConcatOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
functor_(input_list, output);
functor_(input_list, output, future);
return true;
}
......
......@@ -12,7 +12,7 @@ class ConcatOpTest : public OpsTestBase {};
TEST_F(ConcatOpTest, CPUSimpleHorizon) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Concat", "ConcatTest")
.Input("Input0")
.Input("Input1")
......@@ -49,7 +49,7 @@ TEST_F(ConcatOpTest, CPUSimpleHorizon) {
TEST_F(ConcatOpTest, CPUSimpleVertical) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Concat", "ConcatTest")
.Input("Input0")
.Input("Input1")
......@@ -92,7 +92,7 @@ TEST_F(ConcatOpTest, CPURandom) {
int num_inputs = 2 + rand() % 10;
int axis = rand() % dim;
// Construct graph
auto &net = test_net();
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(("Input" + ToString(i)).c_str());
......
......@@ -22,13 +22,13 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data()) {
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -78,11 +78,12 @@ void TestSimple3x3SAME() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#if __ARM_NEON
TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3VALID<DeviceType::NEON>();
TestSimple3x3SAME<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
......@@ -233,9 +234,11 @@ void TestSimple3x3WithoutBias() {
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
......@@ -335,9 +338,11 @@ static void TestCombined3x3() {
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
static void TestNHWCCombined3x3() {
......
......@@ -23,7 +23,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
functor_.dilations_ = this->dilations_.data();
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = nullptr;
......@@ -46,7 +46,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -96,27 +96,33 @@ void TestNxNS12(const index_t height, const index_t width) {
}
#if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
TestNxNS12<DeviceType::NEON>(4, 4);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
TestNxNS12<DeviceType::OPENCL>(4, 4);
}
#if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
TestNxNS12<DeviceType::NEON>(64, 64);
TestNxNS12<DeviceType::NEON>(128, 128);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(64, 64);
TestNxNS12<DeviceType::OPENCL>(128, 128);
}
#if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
TestNxNS12<DeviceType::NEON>(107, 113);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(107, 113);
......
......@@ -22,13 +22,13 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data()) {
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -16,7 +16,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
GlobalAvgPoolingOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
......@@ -29,7 +29,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
auto pooling_func = kernels::GlobalAvgPoolingFunctor<D, T>();
pooling_func(input->data<float>(), input->shape().data(),
output->mutable_data<float>());
output->mutable_data<float>(), future);
return true;
}
......
......@@ -10,7 +10,7 @@ class GlobalAvgPoolingOpTest : public OpsTestBase {};
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
......@@ -32,9 +32,10 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#if __ARM_NEON
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
......@@ -55,3 +56,4 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#endif
......@@ -16,13 +16,13 @@ class ImageToBufferOp: public Operator<D, T> {
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor));
functor_(output, type, const_cast<Tensor *>(input_tensor), future);
return true;
}
......
......@@ -176,7 +176,7 @@ class OpsTestNet {
void Sync() {
if (net_ && device_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
OpenCLRuntime::Global()->command_queue().finish();
}
}
......@@ -188,20 +188,14 @@ class OpsTestNet {
};
class OpsTestBase : public ::testing::Test {
public:
OpsTestNet &test_net() { return test_net_; };
protected:
virtual void TearDown() {
auto ws = test_net_.ws();
auto tensor_names = ws->Tensors();
for (auto &name : tensor_names) {
ws->RemoveTensor(name);
}
virtual void SetUp() {
// OpenCLRuntime::CreateGlobal();
}
private:
OpsTestNet test_net_;
virtual void TearDown() {
// OpenCLRuntime::DestroyGlobal();
}
};
template <typename T>
......
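After this change a test can synchronize with the device in two ways, sketched below: the queue-wide barrier used by Sync() above, or the per-op event wait carried by a StatsFuture:

  OpenCLRuntime::Global()->command_queue().finish();  // barrier: waits for all enqueued kernels
  future.wait_fn(nullptr);                            // waits only for the op that filled `future`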
......@@ -23,11 +23,11 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
functor_(pooling_type_, kernels_.data(), this->strides_.data(),
this->padding_, this->dilations_.data()){};
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
functor_(input, output);
functor_(input, output, future);
return true;
};
......
......@@ -15,7 +15,7 @@ class PoolingOpTest : public OpsTestBase {};
TEST_F(PoolingOpTest, MAX_VALID) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -45,7 +45,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -71,7 +71,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -98,7 +98,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -245,7 +245,7 @@ TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......
......@@ -18,12 +18,12 @@ class ReluOp : public Operator<D, T> {
functor_.max_limit_ =
OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->inputs_[0];
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor);
functor_(input_tensor, output_tensor);
functor_(input_tensor, output_tensor, future);
return true;
}
......
......@@ -53,9 +53,11 @@ TEST_F(ReluOpTest, CPUSimple) {
TestSimple<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONSimple) {
TestSimple<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLSimple) {
TestSimple<DeviceType::OPENCL>();
......@@ -103,9 +105,11 @@ TEST_F(ReluOpTest, CPUUnalignedSimple) {
TestUnalignedSimple<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONUnalignedSimple) {
TestUnalignedSimple<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
TestUnalignedSimple<DeviceType::OPENCL>();
......@@ -157,9 +161,11 @@ TEST_F(ReluOpTest, CPUSimpleReluX) {
TestSimpleReluX<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONSimpleReluX) {
TestSimpleReluX<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLSimpleReluX) {
TestSimpleReluX<DeviceType::OPENCL>();
......@@ -209,9 +215,11 @@ TEST_F(ReluOpTest, CPUUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::OPENCL>();
......
......@@ -19,14 +19,14 @@ class ResizeBilinearOp : public Operator<D, T> {
OperatorBase::GetRepeatedArgument<index_t>("size", {-1, -1}),
OperatorBase::GetSingleArgument<bool>("align_corners", false)) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.",
input->dim_size());
functor_(input, output);
functor_(input, output, future);
return true;
}
......
......@@ -73,7 +73,6 @@ static void ResizeBilinearBenchmark(int iters,
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
// SNPE 835 GPU: 6870us
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
......
......@@ -13,7 +13,7 @@ class ResizeBilinearTest : public OpsTestBase {};
TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
......@@ -37,7 +37,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
......
......@@ -52,14 +52,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *paddings_tensor = this->Input(PADDINGS);
Tensor *output = this->Output(OUTPUT);
SpaceToBatchHelper(input_tensor, block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output, future);
return true;
}
......
......@@ -10,16 +10,6 @@ licenses(["notice"]) # Apache 2.0
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
proto_library(
name = "stats",
srcs = ["stats.proto"],
)
cc_proto_library(
name = "stats_proto",
deps = [":stats"],
)
py_proto_library(
name = "mace_py",
srcs = ["mace.proto"],
......
syntax = "proto2";
package mace;
message OperatorStats {
optional string operator_name = 1;
optional string type = 2;
optional int64 all_start_micros = 3;
optional int64 op_start_rel_micros = 4;
optional int64 op_end_rel_micros = 5;
optional int64 all_end_rel_micros = 6;
};
message RunMetadata {
repeated OperatorStats op_stats = 1;
}
......@@ -43,12 +43,3 @@ py_binary(
"//mace/proto:mace_py",
],
)
py_binary(
name = "tf_ops_stats",
srcs = ["tf_ops_stats.py"],
srcs_version = "PY2AND3",
deps = [
"@six_archive//:six",
],
)
......@@ -7,15 +7,10 @@ cc_library(
name = "stat_summarizer",
srcs = ["stat_summarizer.cc"],
hdrs = ["stat_summarizer.h"],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android([
"-ldl",
"-lm",
]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkstatic = 1,
deps = [
"//mace/core",
"//mace/proto:stats_proto",
],
)
......@@ -24,8 +19,7 @@ cc_binary(
srcs = [
"benchmark_model.cc",
],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkstatic = 1,
deps = [
":stat_summarizer",
......
......@@ -4,7 +4,7 @@
#include "mace/tools/benchmark/stat_summarizer.h"
#include "mace/core/common.h"
#include "mace/proto/stats.pb.h"
#include "mace/core/operator.h"
#include <iomanip>
#include <queue>
......@@ -26,20 +26,21 @@ void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
int64_t curr_total_us = 0;
int64_t mem_total = 0;
int64_t first_node_start_us = run_metadata.op_stats(0).all_start_micros();
MACE_CHECK(!run_metadata.op_stats.empty());
int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
int node_num = 0;
for (const auto &ops : run_metadata.op_stats()) {
std::string name = ops.operator_name();
std::string op_type = ops.type();
for (const auto &ops : run_metadata.op_stats) {
std::string name = ops.operator_name;
std::string op_type = ops.type;
++node_num;
const int64_t curr_time = ops.all_end_rel_micros();
const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
curr_total_us += curr_time;
auto result = details_.emplace(name, Detail());
Detail *detail = &(result.first->second);
detail->start_us.UpdateStat(ops.all_start_micros() - first_node_start_us);
detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
detail->rel_end_us.UpdateStat(curr_time);
// If this is the first pass, initialize some values.
......
......@@ -10,36 +10,43 @@ licenses(["notice"]) # Apache 2.0
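The summarizer now consumes the plain structs from mace/core/future.h instead of the deleted stats.proto. A sketch of how a benchmark loop could fill RunMetadata before calling ProcessMetadata (the per-op name/type accessors and the net_ops container are hypothetical):

  RunMetadata metadata;
  for (auto &op : net_ops) {
    StatsFuture future;
    op->Run(&future);
    OperatorStats op_stats;
    op_stats.operator_name = op->debug_def().name();  // hypothetical accessor
    op_stats.type = op->debug_def().type();           // hypothetical accessor
    future.wait_fn(&op_stats.stats);                  // fills start/end micros
    metadata.op_stats.push_back(op_stats);
  }
  stat_summarizer.ProcessMetadata(metadata);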
load("//mace:mace.bzl", "if_android")
cc_library(
name = "command_line_flags",
name = "logging",
srcs = [
"command_line_flags.cc",
"logging.cc",
],
hdrs = [
"command_line_flags.h",
"logging.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
],
linkopts = if_android([
"-llog",
]),
)
cc_library(
name = "utils",
name = "command_line_flags",
srcs = [
"command_line_flags.cc",
],
hdrs = [
"utils.h",
"command_line_flags.h",
],
copts = ["-std=c++11"],
deps = [
":logging",
],
)
cc_library(
name = "tuner",
hdrs = [
"tuner.h",
"timer.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
"//mace/core:opencl_runtime",
":utils_hdrs",
":logging",
],
)
......@@ -50,7 +57,7 @@ cc_test(
"tuner_test.cc",
],
copts = ["-std=c++11"],
linkopts = if_android(["-lm", "-ldl"]),
linkopts = if_android(["-pie", "-lm"]),
linkstatic = 1,
deps = [
":tuner",
......@@ -58,3 +65,22 @@ cc_test(
"@gtest//:gtest_main",
],
)
cc_library(
name = "utils_hdrs",
hdrs = [
"utils.h",
"env_time.h",
],
copts = ["-std=c++11"],
)
cc_library(
name = "utils",
deps = [
":utils_hdrs",
":tuner",
":command_line_flags",
":logging",
],
)
(7 more file diffs collapsed)