Commit bf78cf25 authored by Liangliang He

Add OpenCL runtime

Parent: b646fd87
@@ -12,22 +12,29 @@ load("//mace:mace.bzl", "if_android")
 cc_library(
     name = "opencl_runtime",
     srcs = glob([
-        "platform/opencl/cl.hpp",
-        "platform/opencl/cl2.hpp",
-        "platform/opencl/opencl_wrapper.h",
-        "platform/opencl/opencl_wrapper.cc",
+        "runtime/opencl/cl.hpp",
+        "runtime/opencl/cl2.hpp",
+        "runtime/opencl/opencl_allocator.cc",
+        "runtime/opencl/opencl_wrapper.cc",
+        "runtime/opencl/opencl_runtime.cc",
     ]),
+    hdrs = [
+        "runtime/opencl/opencl_allocator.h",
+        "runtime/opencl/opencl_runtime.h",
+        "runtime/opencl/opencl_wrapper.h",
+    ],
     copts = ["-std=c++11"],
     deps = [
+        "@opencl_headers//:opencl20_headers",
         "core",
-        "@opencl_headers//:opencl20_headers",
     ],
+    alwayslink = 1,
 )

 cc_binary(
     name = "opencl_smoketest",
     srcs = glob([
-        "platform/opencl/opencl_smoketest.cc",
+        "runtime/opencl/opencl_smoketest.cc",
     ]),
     copts = ["-std=c++11"],
     deps = [
@@ -44,15 +51,15 @@ cc_library(
         "*.h",
     ]),
     copts = ["-std=c++11"],
-    deps = [
-        "//mace/proto:cc_proto",
-        "//mace/proto:stats_proto",
-        "//mace/utils:utils",
-    ],
     linkopts = if_android([
         "-llog",
         "-pie",
     ]),
+    deps = [
+        "//mace/proto:cc_proto",
+        "//mace/proto:stats_proto",
+        "//mace/utils",
+    ],
 )

 # Main program for tests
......
@@ -6,20 +6,21 @@
 namespace mace {

-static std::unique_ptr<CPUAllocator> g_cpu_allocator(new CPUAllocator());
-CPUAllocator* cpu_allocator() { return g_cpu_allocator.get(); }
-
-void SetCPUAllocator(CPUAllocator* alloc) { g_cpu_allocator.reset(alloc); }
-
-Allocator* GetDeviceAllocator(DeviceType type) {
-  switch (type) {
-    case DeviceType::CPU:
-    case DeviceType::NEON:
-      return cpu_allocator();
-    default:
-      MACE_CHECK(false, "device type ", type, " is not supported.");
-  }
-  return nullptr;
-}
+std::map<int32_t, Allocator *> *gAllocatorRegistry() {
+  static std::map<int32_t, Allocator *> g_allocator_registry;
+  return &g_allocator_registry;
+}
+
+Allocator *GetDeviceAllocator(DeviceType type) {
+  auto iter = gAllocatorRegistry()->find(type);
+  if (iter == gAllocatorRegistry()->end()) {
+    LOG(ERROR) << "Allocator not found for device " << type;
+    return nullptr;
+  }
+  return iter->second;
+}
+
+MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
+MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());

 }  // namespace mace
@@ -8,6 +8,7 @@
 #include <malloc.h>

 #include "mace/core/common.h"
+#include "mace/core/registry.h"
 #include "mace/proto/mace.pb.h"

 namespace mace {
@@ -24,17 +25,19 @@ class Allocator {
  public:
   Allocator() {}
   virtual ~Allocator() noexcept {}
-  virtual void* New(size_t nbytes) = 0;
-  virtual void Delete(void* data) = 0;
-  virtual void CopyBytes(void* dst, const void* src, size_t size) = 0;
+  virtual void *New(size_t nbytes) = 0;
+  virtual void Delete(void *data) = 0;
+  virtual void *Map(void *buffer, size_t nbytes) = 0;
+  virtual void Unmap(void *buffer, void *mapper_ptr) = 0;
+  virtual bool OnHost() = 0;
+
   template <typename T>
-  T* New(size_t num_elements) {
+  T *New(size_t num_elements) {
     if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) {
       return nullptr;
     }
-    void* p = New(sizeof(T) * num_elements);
-    T* typed_p = reinterpret_cast<T*>(p);
+    void *p = New(sizeof(T) * num_elements);
+    T *typed_p = reinterpret_cast<T *>(p);
     return typed_p;
   }
 };
@@ -42,8 +45,8 @@ class Allocator {
 class CPUAllocator : public Allocator {
  public:
   ~CPUAllocator() override {}
-  void* New(size_t nbytes) override {
-    void* data = nullptr;
+  void *New(size_t nbytes) override {
+    void *data = nullptr;
 #ifdef __ANDROID__
     data = memalign(kMaceAlignment, nbytes);
 #else
@@ -55,33 +58,32 @@ class CPUAllocator : public Allocator {
     return data;
   }
-  void Delete(void* data) override { free(data); }
-
-  void CopyBytes(void* dst, const void* src, size_t size) override {
-    memcpy(dst, src, size);
-  }
+  void Delete(void *data) override { free(data); }
+  void *Map(void *buffer, size_t nbytes) { return buffer; }
+  void Unmap(void *buffer, void *mapper_ptr) {}
+  bool OnHost() { return true; }
 };

-// Get the CPU Alloctor.
-CPUAllocator* cpu_allocator();
-
-// Sets the CPU allocator to the given allocator: the caller gives away the
-// ownership of the pointer.
-void SetCPUAllocator(CPUAllocator* alloc);
-
-template <DeviceType D>
-struct DeviceContext {};
-
-template <>
-struct DeviceContext<DeviceType::CPU> {
-  static Allocator* allocator() { return cpu_allocator(); }
-};
-
-template <>
-struct DeviceContext<DeviceType::NEON> {
-  static Allocator* allocator() { return cpu_allocator(); }
-};
-
-Allocator* GetDeviceAllocator(DeviceType type);
+std::map<int32_t, Allocator *> *gAllocatorRegistry();
+
+Allocator *GetDeviceAllocator(DeviceType type);
+
+struct AllocatorRegisterer {
+  explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) {
+    if (gAllocatorRegistry()->count(type)) {
+      LOG(ERROR) << "Allocator for device type " << type
+                 << " registered twice. This should not happen."
+                 << gAllocatorRegistry()->count(type);
+      std::exit(1);
+    }
+    gAllocatorRegistry()->emplace(type, alloc);
+  }
+};
+
+#define MACE_REGISTER_ALLOCATOR(type, alloc) \
+  namespace {                                \
+  static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \
+  }

 }  // namespace mace
......
@@ -6,31 +6,37 @@
 namespace mace {

-std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
-  static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
+std::map<int32_t, OperatorRegistry *> *gDeviceTypeRegistry() {
+  static std::map<int32_t, OperatorRegistry *> g_device_type_registry;
   return &g_device_type_registry;
 }

 MACE_DEFINE_REGISTRY(CPUOperatorRegistry,
                      OperatorBase,
-                     const OperatorDef&,
-                     Workspace*);
+                     const OperatorDef &,
+                     Workspace *);
 MACE_REGISTER_DEVICE_TYPE(DeviceType::CPU, CPUOperatorRegistry);

 MACE_DEFINE_REGISTRY(NEONOperatorRegistry,
                      OperatorBase,
-                     const OperatorDef&,
-                     Workspace*);
+                     const OperatorDef &,
+                     Workspace *);
 MACE_REGISTER_DEVICE_TYPE(DeviceType::NEON, NEONOperatorRegistry);

-unique_ptr<OperatorBase> CreateOperator(const OperatorDef& operator_def,
-                                        Workspace* ws,
+MACE_DEFINE_REGISTRY(OPENCLOperatorRegistry,
+                     OperatorBase,
+                     const OperatorDef &,
+                     Workspace *);
+MACE_REGISTER_DEVICE_TYPE(DeviceType::OPENCL, OPENCLOperatorRegistry);
+
+unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
+                                        Workspace *ws,
                                         DeviceType type) {
-  OperatorRegistry* registry = gDeviceTypeRegistry()->at(type);
+  OperatorRegistry *registry = gDeviceTypeRegistry()->at(type);
   return registry->Create(operator_def.type(), operator_def, ws);
 }

-OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
+OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
     : operator_ws_(ws),
       operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
......
@@ -92,7 +92,7 @@ class Operator : public OperatorBase {
     for (const string &output_str : operator_def.output()) {
       outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
-          output_str, DeviceContext<D>::allocator(), DataTypeToEnum<T>::v())));
+          output_str, GetDeviceAllocator(D), DataTypeToEnum<T>::v())));
     }
   }
   virtual bool Run() override = 0;
@@ -160,6 +160,16 @@ MACE_DECLARE_REGISTRY(NEONOperatorRegistry,
 #define REGISTER_NEON_OPERATOR(name, ...) \
   MACE_REGISTER_CLASS(NEONOperatorRegistry, name, __VA_ARGS__)

+MACE_DECLARE_REGISTRY(OPENCLOperatorRegistry,
+                      OperatorBase,
+                      const OperatorDef &,
+                      Workspace *);
+
+#define REGISTER_OPENCL_OPERATOR_CREATOR(key, ...) \
+  MACE_REGISTER_CREATOR(OPENCLOperatorRegistry, key, __VA_ARGS__)
+
+#define REGISTER_OPENCL_OPERATOR(name, ...) \
+  MACE_REGISTER_CLASS(OPENCLOperatorRegistry, name, __VA_ARGS__)
+
 unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
                                         Workspace *ws,
                                         DeviceType type);
......
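For reference, a minimal sketch of how an operator could be registered against the new OPENCLOperatorRegistry declared above. The ReluOp class, its template signature, and the op name are hypothetical and not part of this commit; they only mirror the existing CPU/NEON registration pattern.

// Hypothetical example, not included in this commit.
#include "mace/core/operator.h"

namespace mace {

template <DeviceType D, typename T>
class ReluOp : public Operator<D, T> {
 public:
  ReluOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<D, T>(operator_def, ws) {}
  // A real implementation would launch a kernel obtained from
  // OpenCLRuntime::Get()->GetProgram(...) here.
  bool Run() override { return true; }
};

// Makes the class reachable via CreateOperator(def, ws, DeviceType::OPENCL).
REGISTER_OPENCL_OPERATOR(Relu, ReluOp<DeviceType::OPENCL, float>);

}  // namespace mace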
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#define CL_HPP_MINIMUM_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include "mace/core/logging.h"
#include "mace/core/platform/opencl/cl2.hpp"
#include "mace/core/platform/opencl/opencl_wrapper.h"
int main() {
LOG(INFO) << "OpenCL support: " << mace::OpenCLSupported();
if (!mace::OpenCLSupported()) return 1;
LOG(INFO) << "Start OpenCL test";
// get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(INFO) << " No OpenCL platforms found";
return 1;
}
LOG(INFO) << "Platform sizes: " << all_platforms.size();
cl::Platform default_platform = all_platforms[0];
LOG(INFO) << "Using platform: "
<< default_platform.getInfo<CL_PLATFORM_NAME>() << ", "
<< default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(INFO) << "No OpenCL devices found";
return 1;
}
// Use the last device
cl::Device default_device = *all_devices.rbegin();
LOG(INFO) << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>()
<< ", " << default_device.getInfo<CL_DEVICE_TYPE>();
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({default_device});
// create the program that we want to execute on the device
cl::Program::Sources sources;
// calculates for each element; C = A + B
std::string kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C, "
" global const int* N) {"
" int ID, Nthreads, n, ratio, start, stop;"
""
" ID = get_global_id(0);"
" Nthreads = get_global_size(0);"
" n = N[0];"
""
" ratio = (n / Nthreads);" // number of elements for each thread
" start = ratio * ID;"
" stop = ratio * (ID + 1);"
""
" for (int i=start; i<stop; i++)"
" C[i] = A[i] + B[i];"
" }";
sources.push_back({kernel_code.c_str(), kernel_code.length()});
cl::Program program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
LOG(INFO) << "Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device);
return 1;
}
// apparently OpenCL only likes arrays ...
// N holds the number of elements in the vectors we want to add
int N[1] = {1000};
int n = N[0];
// create buffers on device (allocate space on GPU)
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n);
cl::Buffer buffer_N(context, CL_MEM_READ_ONLY, sizeof(int));
// create things on here (CPU)
int A[n], B[n];
for (int i = 0; i < n; i++) {
A[i] = i;
B[i] = 2 * i;
}
// create a queue (a queue of commands that the GPU will execute)
cl::CommandQueue queue(context, default_device);
// push write commands to queue
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * n, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * n, B);
queue.enqueueWriteBuffer(buffer_N, CL_TRUE, 0, sizeof(int), N);
auto simple_add =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(
program, "simple_add");
cl_int error;
simple_add(cl::EnqueueArgs(queue, cl::NDRange(100), cl::NDRange(10)),
buffer_A, buffer_B, buffer_C, buffer_N, error);
if (error != 0) {
LOG(ERROR) << "Failed to execute kernel " << error;
}
int C[n];
// read result from GPU to here
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * n, C);
bool correct = true;
for (int i = 0; i < n; i++) {
if (C[i] != A[i] + B[i]) correct = false;
}
LOG(INFO) << "OpenCL test result: " << (correct ? "correct" : "incorrect");
return 0;
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "CL/opencl.h"
#include "mace/core/logging.h"
#include <dlfcn.h>
#include <mutex>
namespace mace {
bool OpenCLSupported();
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/runtime/opencl/cl2.hpp"
namespace mace {
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) {
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Get()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
MACE_CHECK_NOTNULL(buffer);
return static_cast<void *>(buffer);
}
void OpenCLAllocator::Delete(void *buffer) {
if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
delete cl_buffer;
}
}
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
// TODO (heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS);
}
bool OpenCLAllocator::OnHost() { return false; }
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#include "mace/core/allocator.h"
namespace mace {
class OpenCLAllocator : public Allocator {
public:
OpenCLAllocator();
~OpenCLAllocator() override;
void *New(size_t nbytes) override;
void Delete(void *buffer) override;
void *Map(void *buffer, size_t nbytes) override;
void Unmap(void *buffer, void *mapped_ptr) override;
bool OnHost() override;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
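As a rough illustration of the buffer lifecycle behind this interface (New, Map, write, Unmap, Delete), here is a sketch that drives the allocator directly through the generic Allocator API. It assumes an OpenCL driver is actually loadable on the device and is not code from this commit.

// Illustrative sketch only.
#include <cstdint>
#include "mace/core/allocator.h"

void OpenCLAllocatorSketch() {
  mace::Allocator *alloc = mace::GetDeviceAllocator(mace::DeviceType::OPENCL);
  if (alloc == nullptr) return;  // OpenCL allocator not registered

  const size_t nbytes = 1000 * sizeof(int32_t);
  void *buffer = alloc->New(nbytes);  // returns a cl::Buffer* behind a void*
  // Device memory must be mapped before the host touches it.
  int32_t *host = static_cast<int32_t *>(alloc->Map(buffer, nbytes));
  for (int i = 0; i < 1000; ++i) host[i] = i;
  alloc->Unmap(buffer, host);  // unmap before the buffer is reused or freed
  alloc->Delete(buffer);
}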
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <cstdlib>
#include <fstream>
#include <mutex>
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
namespace mace {
namespace {
bool ReadSourceFile(const char *filename, std::string *content) {
MACE_CHECK_NOTNULL(filename);
MACE_CHECK_NOTNULL(content);
*content = "";
std::ifstream ifs(filename, std::ifstream::in);
if (!ifs.is_open()) {
LOG(ERROR) << "Failed to open file " << filename;
return false;
}
std::string line;
while (std::getline(ifs, line)) {
*content += line;
}
ifs.close();
return true;
}
bool BuildProgram(OpenCLRuntime *runtime, const char *filename, cl::Program *program) {
MACE_CHECK_NOTNULL(filename);
MACE_CHECK_NOTNULL(program);
std::string kernel_code;
if (!ReadSourceFile(filename, &kernel_code)) {
LOG(ERROR) << "Failed to read kernel source " << filename;
return false;
}
cl::Program::Sources sources;
sources.push_back({kernel_code.c_str(), kernel_code.length()});
*program = cl::Program(runtime->context(), sources);
if (program->build({runtime->device()}) != CL_SUCCESS) {
LOG(INFO) << "Error building: "
<< program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
return false;
}
return true;
}
} // namespace
OpenCLRuntime *OpenCLRuntime::Get() {
static std::once_flag init_once;
static OpenCLRuntime *instance = nullptr;
std::call_once(init_once, []() {
if (!mace::OpenCLLibrary::Supported()) {
LOG(ERROR) << "OpenCL not supported";
return;
}
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(ERROR) << "No OpenCL platforms found";
return;
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: "
<< default_platform.getInfo<CL_PLATFORM_NAME>() << ", "
<< default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(ERROR) << "No OpenCL devices found";
return;
}
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(ERROR) << "No GPU device found";
return;
}
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device);
instance = new OpenCLRuntime(context, gpu_device, command_queue);
});
return instance;
}
OpenCLRuntime::OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue)
: context_(context), device_(device), command_queue_(command_queue) {}
OpenCLRuntime::~OpenCLRuntime() {}
cl::Context &OpenCLRuntime::context() { return context_; }
cl::Device &OpenCLRuntime::device() { return device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
cl::Program OpenCLRuntime::GetProgram(const std::string &name) {
static const char *kernel_source_path = getenv("MACE_KERNEL_SOURCE_PATH");
std::string filename = name;
if (kernel_source_path != nullptr) {
filename = kernel_source_path + name;
}
std::lock_guard<std::mutex> lock(program_lock_);
// TODO (heliangliang) Support binary format
auto iter = programs_.find(name);
if (iter != programs_.end()) {
return iter->second;
} else {
cl::Program program;
MACE_CHECK(BuildProgram(this, filename.c_str(), &program));
programs_.emplace(name, program);
return program;
}
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
#ifndef CL_HPP_TARGET_OPENCL_VERSION
#define CL_HPP_TARGET_OPENCL_VERSION 200
#endif
#include <map>
#include <mutex>
#include "mace/core/runtime/opencl/cl2.hpp"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
namespace mace {
class OpenCLRuntime {
public:
static OpenCLRuntime *Get();
OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue);
~OpenCLRuntime();
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
cl::Program GetProgram(const std::string &name);
private:
cl::Context context_;
cl::CommandQueue command_queue_;
cl::Device device_;
std::map<std::string, cl::Program> programs_;
std::mutex program_lock_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
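One practical note on GetProgram: it prepends the MACE_KERNEL_SOURCE_PATH environment variable to the program name verbatim, so the value needs a trailing slash. A small sketch of driving that lookup follows; the kernel directory path is hypothetical and not from this commit.

// Illustrative sketch only; the kernel directory is hypothetical.
#include <cstdlib>
#include "mace/core/runtime/opencl/opencl_runtime.h"

int BuildSimpleAdd() {
  // Must be set before the first GetProgram() call, which caches the value.
  setenv("MACE_KERNEL_SOURCE_PATH", "/data/local/tmp/cl/", 1);
  auto runtime = mace::OpenCLRuntime::Get();
  if (runtime == nullptr) return -1;  // OpenCL unavailable or no GPU found
  cl::Program program = runtime->GetProgram("simple_add.cl");
  return 0;
}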
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/logging.h"
#include "mace/core/operator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
int main() {
using namespace mace;
auto runtime = mace::OpenCLRuntime::Get();
mace::Tensor ta(GetDeviceAllocator(DeviceType::OPENCL), DataType::DT_INT32);
mace::Tensor tb(GetDeviceAllocator(DeviceType::OPENCL), DataType::DT_INT32);
mace::Tensor tc(GetDeviceAllocator(DeviceType::OPENCL), DataType::DT_INT32);
mace::Tensor tstep(GetDeviceAllocator(DeviceType::OPENCL),
DataType::DT_INT32);
int n = 1000;
std::vector<index_t> shape = {n};
ta.Resize(shape);
tb.Resize(shape);
tc.Resize(shape);
tstep.Resize({1});
int step_size = 10;
int global_size = n / step_size;
{
mace::Tensor::MappingGuard ta_mapper(&ta);
mace::Tensor::MappingGuard tb_mapper(&tb);
mace::Tensor::MappingGuard tstep_mapper(&tstep);
int32_t *a = ta.mutable_data<int32_t>();
int32_t *b = tb.mutable_data<int32_t>();
int32_t *step = tstep.mutable_data<int32_t>();
for (int i = 0; i < n; i++) {
a[i] = i;
b[i] = 2 * i;
}
step[0] = step_size;
}
auto program = runtime->GetProgram("simple_add.cl");
auto simple_add =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(
program, "simple_add");
cl_int error;
simple_add(cl::EnqueueArgs(runtime->command_queue(), cl::NDRange(global_size),
cl::NullRange),
*(static_cast<cl::Buffer *>(ta.buffer())),
*(static_cast<cl::Buffer *>(tb.buffer())),
*(static_cast<cl::Buffer *>(tc.buffer())),
*(static_cast<cl::Buffer *>(tstep.buffer())), error);
if (error != 0) {
LOG(ERROR) << "Failed to execute kernel " << error;
}
{
mace::Tensor::MappingGuard ta_mapper(&ta);
mace::Tensor::MappingGuard tb_mapper(&tb);
mace::Tensor::MappingGuard tc_mapper(&tc);
int32_t *a = ta.mutable_data<int32_t>();
int32_t *b = tb.mutable_data<int32_t>();
int32_t *c = tc.mutable_data<int32_t>();
bool correct = true;
for (int i = 0; i < n; i++) {
if (c[i] != a[i] + b[i]) correct = false;
}
LOG(INFO) << "OpenCL test result: " << (correct ? "correct" : "incorrect");
}
return 0;
}
@@ -5,7 +5,7 @@
 #include "CL/opencl.h"
 #include "mace/core/logging.h"
-#include "mace/core/platform/opencl/opencl_wrapper.h"
+#include "mace/core/runtime/opencl/opencl_wrapper.h"

 #include <dlfcn.h>
 #include <mutex>
@@ -14,10 +14,14 @@
  * Wrapper of OpenCL 2.0 (based on 1.2)
  */
 namespace mace {

-class OpenCLStub final {
+namespace {
+
+class OpenCLLibraryImpl final {
  public:
-  static OpenCLStub &Get();
-  bool loaded() { return loaded_; }
+  static OpenCLLibraryImpl &Get();
+  bool Load();
+  void Unload();
+  bool loaded() { return handle_ != nullptr; }

   using clGetPlatformIDsFunc = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *);
   using clGetPlatformInfoFunc =
@@ -177,21 +181,23 @@ class OpenCLStub final {
 #undef DEFINE_FUNC_PTR

  private:
-  bool TryLoadAll();
-  bool Load(const std::string &library);
-  bool loaded_ = false;
+  void *LoadFromPath(const std::string &path);
+  void *handle_ = nullptr;
 };

-OpenCLStub &OpenCLStub::Get() {
+OpenCLLibraryImpl &OpenCLLibraryImpl::Get() {
   static std::once_flag load_once;
-  static OpenCLStub instance;
-  std::call_once(load_once, []() { instance.TryLoadAll(); });
+  static OpenCLLibraryImpl instance;
+  std::call_once(load_once, []() { instance.Load(); });
   return instance;
 }

-bool OpenCLStub::TryLoadAll() {
-  if (loaded()) return true;
+bool OpenCLLibraryImpl::Load() {
   // TODO (heliangliang) Make this configurable
-  static const std::vector<std::string> pathes = {
+  // TODO (heliangliang) Benchmark 64 bit overhead
+  static const std::vector<std::string> paths = {
 #if defined(__aarch64__)
   // Qualcomm Adreno
   "/system/vendor/lib64/libOpenCL.so",
@@ -209,9 +215,11 @@ bool OpenCLStub::TryLoadAll() {
 #endif
   };

-  for (const auto &path : pathes) {
+  for (const auto &path : paths) {
     VLOG(2) << "Loading OpenCL from " << path;
-    if (Load(path)) {
+    void *handle = LoadFromPath(path);
+    if (handle != nullptr) {
+      handle_ = handle;
       return true;
     }
   }
@@ -220,13 +228,22 @@
   return false;
 }

-bool OpenCLStub::Load(const std::string &path) {
+void OpenCLLibraryImpl::Unload() {
+  if (handle_ != nullptr) {
+    if (dlclose(handle_) != 0) {
+      LOG(ERROR) << "dlclose failed for OpenCL library";
+    }
+    handle_ = nullptr;
+  }
+}
+
+void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
   void *handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
   if (handle == nullptr) {
     VLOG(2) << "Failed to load OpenCL library from path " << path
             << " error code: " << dlerror();
-    return false;
+    return nullptr;
   }

 #define ASSIGN_FROM_DLSYM(func) \
@@ -234,9 +251,8 @@ bool OpenCLStub::Load(const std::string &path) {
     void *ptr = dlsym(handle, #func); \
     if (ptr == nullptr) { \
       LOG(ERROR) << "Failed to load " << #func << " from " << path; \
-      loaded_ = false; \
       dlclose(handle); \
-      return false; \
+      return nullptr; \
     } \
     func = reinterpret_cast<func##Func>(ptr); \
     VLOG(2) << "Loaded " << #func << " from " << path; \
@@ -283,19 +299,22 @@ bool OpenCLStub::Load(const std::string &path) {
 #undef ASSIGN_FROM_DLSYM

-  loaded_ = true;
-  // TODO (heliangliang) Call dlclose if we are dlclosed
-  return true;
+  return handle;
 }

-bool OpenCLSupported() { return OpenCLStub::Get().loaded(); }
+}  // namespace
+
+bool OpenCLLibrary::Supported() { return OpenCLLibraryImpl::Get().loaded(); }
+
+void OpenCLLibrary::Load() { OpenCLLibraryImpl::Get().Load(); }
+
+void OpenCLLibrary::Unload() { OpenCLLibraryImpl::Get().Unload(); }

 }  // namespace mace

 cl_int clGetPlatformIDs(cl_uint num_entries,
                         cl_platform_id *platforms,
                         cl_uint *num_platforms) {
-  auto func = mace::OpenCLStub::Get().clGetPlatformIDs;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetPlatformIDs;
   if (func != nullptr) {
     return func(num_entries, platforms, num_platforms);
   } else {
@@ -307,7 +326,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform,
                          size_t param_value_size,
                          void *param_value,
                          size_t *param_value_size_ret) {
-  auto func = mace::OpenCLStub::Get().clGetPlatformInfo;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetPlatformInfo;
   if (func != nullptr) {
     return func(platform, param_name, param_value_size, param_value,
                 param_value_size_ret);
@@ -323,7 +342,7 @@ cl_int clBuildProgram(cl_program program,
                       void(CL_CALLBACK *pfn_notify)(cl_program program,
                                                     void *user_data),
                       void *user_data) {
-  auto func = mace::OpenCLStub::Get().clBuildProgram;
+  auto func = mace::OpenCLLibraryImpl::Get().clBuildProgram;
   if (func != nullptr) {
     return func(program, num_devices, device_list, options, pfn_notify,
                 user_data);
@@ -341,7 +360,7 @@ cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
                               cl_uint num_events_in_wait_list,
                               const cl_event *event_wait_list,
                               cl_event *event) {
-  auto func = mace::OpenCLStub::Get().clEnqueueNDRangeKernel;
+  auto func = mace::OpenCLLibraryImpl::Get().clEnqueueNDRangeKernel;
   if (func != nullptr) {
     return func(command_queue, kernel, work_dim, global_work_offset,
                 global_work_size, local_work_size, num_events_in_wait_list,
@@ -355,7 +374,7 @@ cl_int clSetKernelArg(cl_kernel kernel,
                       cl_uint arg_index,
                       size_t arg_size,
                       const void *arg_value) {
-  auto func = mace::OpenCLStub::Get().clSetKernelArg;
+  auto func = mace::OpenCLLibraryImpl::Get().clSetKernelArg;
   if (func != nullptr) {
     return func(kernel, arg_index, arg_size, arg_value);
   } else {
@@ -364,7 +383,7 @@ cl_int clSetKernelArg(cl_kernel kernel,
 }

 cl_int clRetainMemObject(cl_mem memobj) {
-  auto func = mace::OpenCLStub::Get().clRetainMemObject;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainMemObject;
   if (func != nullptr) {
     return func(memobj);
   } else {
@@ -373,7 +392,7 @@ cl_int clRetainMemObject(cl_mem memobj) {
 }

 cl_int clReleaseMemObject(cl_mem memobj) {
-  auto func = mace::OpenCLStub::Get().clReleaseMemObject;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseMemObject;
   if (func != nullptr) {
     return func(memobj);
   } else {
@@ -387,7 +406,7 @@ cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
                                cl_uint num_events_in_wait_list,
                                const cl_event *event_wait_list,
                                cl_event *event) {
-  auto func = mace::OpenCLStub::Get().clEnqueueUnmapMemObject;
+  auto func = mace::OpenCLLibraryImpl::Get().clEnqueueUnmapMemObject;
   if (func != nullptr) {
     return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list,
                 event_wait_list, event);
@@ -397,7 +416,7 @@ cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
 }

 cl_int clRetainCommandQueue(cl_command_queue command_queue) {
-  auto func = mace::OpenCLStub::Get().clRetainCommandQueue;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainCommandQueue;
   if (func != nullptr) {
     return func(command_queue);
   } else {
@@ -411,7 +430,7 @@ cl_context clCreateContext(
     void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
     void *user_data,
     cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateContext;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateContext;
   if (func != nullptr) {
     return func(properties, num_devices, devices, pfn_notify, user_data,
                 errcode_ret);
@@ -425,7 +444,7 @@ cl_context clCreateContextFromType(
     void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
     void *user_data,
     cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateContextFromType;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateContextFromType;
   if (func != nullptr) {
     return func(properties, device_type, pfn_notify, user_data, errcode_ret);
   } else {
@@ -434,7 +453,7 @@ cl_context clCreateContextFromType(
 }

 cl_int clReleaseContext(cl_context context) {
-  auto func = mace::OpenCLStub::Get().clReleaseContext;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseContext;
   if (func != nullptr) {
     return func(context);
   } else {
@@ -443,7 +462,7 @@ cl_int clReleaseContext(cl_context context) {
 }

 cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) {
-  auto func = mace::OpenCLStub::Get().clWaitForEvents;
+  auto func = mace::OpenCLLibraryImpl::Get().clWaitForEvents;
   if (func != nullptr) {
     return func(num_events, event_list);
   } else {
@@ -452,7 +471,7 @@ cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) {
 }

 cl_int clReleaseEvent(cl_event event) {
-  auto func = mace::OpenCLStub::Get().clReleaseEvent;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseEvent;
   if (func != nullptr) {
     return func(event);
   } else {
@@ -469,7 +488,7 @@ cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
                             cl_uint num_events_in_wait_list,
                             const cl_event *event_wait_list,
                             cl_event *event) {
-  auto func = mace::OpenCLStub::Get().clEnqueueWriteBuffer;
+  auto func = mace::OpenCLLibraryImpl::Get().clEnqueueWriteBuffer;
   if (func != nullptr) {
     return func(command_queue, buffer, blocking_write, offset, size, ptr,
                 num_events_in_wait_list, event_wait_list, event);
@@ -487,7 +506,7 @@ cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
                            cl_uint num_events_in_wait_list,
                            const cl_event *event_wait_list,
                            cl_event *event) {
-  auto func = mace::OpenCLStub::Get().clEnqueueReadBuffer;
+  auto func = mace::OpenCLLibraryImpl::Get().clEnqueueReadBuffer;
   if (func != nullptr) {
     return func(command_queue, buffer, blocking_read, offset, size, ptr,
                 num_events_in_wait_list, event_wait_list, event);
@@ -502,7 +521,7 @@ cl_int clGetProgramBuildInfo(cl_program program,
                              size_t param_value_size,
                              void *param_value,
                              size_t *param_value_size_ret) {
-  auto func = mace::OpenCLStub::Get().clGetProgramBuildInfo;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetProgramBuildInfo;
   if (func != nullptr) {
     return func(program, device, param_name, param_value_size, param_value,
                 param_value_size_ret);
@@ -512,7 +531,7 @@ cl_int clGetProgramBuildInfo(cl_program program,
 }

 cl_int clRetainProgram(cl_program program) {
-  auto func = mace::OpenCLStub::Get().clRetainProgram;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainProgram;
   if (func != nullptr) {
     return func(program);
   } else {
@@ -530,7 +549,7 @@ void *clEnqueueMapBuffer(cl_command_queue command_queue,
                          const cl_event *event_wait_list,
                          cl_event *event,
                          cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clEnqueueMapBuffer;
+  auto func = mace::OpenCLLibraryImpl::Get().clEnqueueMapBuffer;
   if (func != nullptr) {
     return func(command_queue, buffer, blocking_map, map_flags, offset, size,
                 num_events_in_wait_list, event_wait_list, event, errcode_ret);
@@ -546,7 +565,7 @@ cl_command_queue clCreateCommandQueueWithProperties(
     cl_device_id device,
     const cl_queue_properties *properties,
     cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateCommandQueueWithProperties;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateCommandQueueWithProperties;
   if (func != nullptr) {
     return func(context, device, properties, errcode_ret);
   } else {
@@ -555,7 +574,7 @@ cl_command_queue clCreateCommandQueueWithProperties(
 }

 cl_int clReleaseCommandQueue(cl_command_queue command_queue) {
-  auto func = mace::OpenCLStub::Get().clReleaseCommandQueue;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseCommandQueue;
   if (func != nullptr) {
     return func(command_queue);
   } else {
@@ -570,7 +589,7 @@ cl_program clCreateProgramWithBinary(cl_context context,
                                      const unsigned char **binaries,
                                      cl_int *binary_status,
                                      cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateProgramWithBinary;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateProgramWithBinary;
   if (func != nullptr) {
     return func(context, num_devices, device_list, lengths, binaries,
                 binary_status, errcode_ret);
@@ -583,7 +602,7 @@ cl_program clCreateProgramWithBinary(cl_context context,
 }

 cl_int clRetainContext(cl_context context) {
-  auto func = mace::OpenCLStub::Get().clRetainContext;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainContext;
   if (func != nullptr) {
     return func(context);
   } else {
@@ -596,7 +615,7 @@ cl_int clGetContextInfo(cl_context context,
                         size_t param_value_size,
                         void *param_value,
                         size_t *param_value_size_ret) {
-  auto func = mace::OpenCLStub::Get().clGetContextInfo;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetContextInfo;
   if (func != nullptr) {
     return func(context, param_name, param_value_size, param_value,
                 param_value_size_ret);
@@ -606,7 +625,7 @@ cl_int clGetContextInfo(cl_context context,
 }

 cl_int clReleaseProgram(cl_program program) {
-  auto func = mace::OpenCLStub::Get().clReleaseProgram;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseProgram;
   if (func != nullptr) {
     return func(program);
   } else {
@@ -615,7 +634,7 @@ cl_int clReleaseProgram(cl_program program) {
 }

 cl_int clFlush(cl_command_queue command_queue) {
-  auto func = mace::OpenCLStub::Get().clFlush;
+  auto func = mace::OpenCLLibraryImpl::Get().clFlush;
   if (func != nullptr) {
     return func(command_queue);
   } else {
@@ -624,7 +643,7 @@ cl_int clFlush(cl_command_queue command_queue) {
 }

 cl_int clFinish(cl_command_queue command_queue) {
-  auto func = mace::OpenCLStub::Get().clFinish;
+  auto func = mace::OpenCLLibraryImpl::Get().clFinish;
   if (func != nullptr) {
     return func(command_queue);
   } else {
@@ -637,7 +656,7 @@ cl_int clGetProgramInfo(cl_program program,
                         size_t param_value_size,
                         void *param_value,
                         size_t *param_value_size_ret) {
-  auto func = mace::OpenCLStub::Get().clGetProgramInfo;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetProgramInfo;
   if (func != nullptr) {
     return func(program, param_name, param_value_size, param_value,
                 param_value_size_ret);
@@ -649,7 +668,7 @@ cl_int clGetProgramInfo(cl_program program,
 cl_kernel clCreateKernel(cl_program program,
                          const char *kernel_name,
                          cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateKernel;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateKernel;
   if (func != nullptr) {
     return func(program, kernel_name, errcode_ret);
   } else {
@@ -661,7 +680,7 @@ cl_kernel clCreateKernel(cl_program program,
 }

 cl_int clRetainKernel(cl_kernel kernel) {
-  auto func = mace::OpenCLStub::Get().clRetainKernel;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainKernel;
   if (func != nullptr) {
     return func(kernel);
   } else {
@@ -674,7 +693,7 @@ cl_mem clCreateBuffer(cl_context context,
                       size_t size,
                       void *host_ptr,
                       cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateBuffer;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateBuffer;
   if (func != nullptr) {
     return func(context, flags, size, host_ptr, errcode_ret);
   } else {
@@ -690,7 +709,7 @@ cl_program clCreateProgramWithSource(cl_context context,
                                      const char **strings,
                                      const size_t *lengths,
                                      cl_int *errcode_ret) {
-  auto func = mace::OpenCLStub::Get().clCreateProgramWithSource;
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateProgramWithSource;
   if (func != nullptr) {
     return func(context, count, strings, lengths, errcode_ret);
   } else {
@@ -702,7 +721,7 @@ cl_program clCreateProgramWithSource(cl_context context,
 }

 cl_int clReleaseKernel(cl_kernel kernel) {
-  auto func = mace::OpenCLStub::Get().clReleaseKernel;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseKernel;
   if (func != nullptr) {
     return func(kernel);
   } else {
@@ -715,7 +734,7 @@ cl_int clGetDeviceIDs(cl_platform_id platform,
                       cl_uint num_entries,
                       cl_device_id *devices,
                       cl_uint *num_devices) {
-  auto func = mace::OpenCLStub::Get().clGetDeviceIDs;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetDeviceIDs;
   if (func != nullptr) {
     return func(platform, device_type, num_entries, devices, num_devices);
   } else {
@@ -728,7 +747,7 @@ cl_int clGetDeviceInfo(cl_device_id device,
                        size_t param_value_size,
                        void *param_value,
                        size_t *param_value_size_ret) {
-  auto func = mace::OpenCLStub::Get().clGetDeviceInfo;
+  auto func = mace::OpenCLLibraryImpl::Get().clGetDeviceInfo;
   if (func != nullptr) {
     return func(device, param_name, param_value_size, param_value,
                 param_value_size_ret);
@@ -738,7 +757,7 @@ cl_int clGetDeviceInfo(cl_device_id device,
 }

 cl_int clRetainDevice(cl_device_id device) {
-  auto func = mace::OpenCLStub::Get().clRetainDevice;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainDevice;
   if (func != nullptr) {
     return func(device);
   } else {
@@ -747,7 +766,7 @@ cl_int clRetainDevice(cl_device_id device) {
 }

 cl_int clReleaseDevice(cl_device_id device) {
-  auto func = mace::OpenCLStub::Get().clReleaseDevice;
+  auto func = mace::OpenCLLibraryImpl::Get().clReleaseDevice;
   if (func != nullptr) {
     return func(device);
   } else {
@@ -756,7 +775,7 @@ cl_int clReleaseDevice(cl_device_id device) {
 }

 cl_int clRetainEvent(cl_event event) {
-  auto func = mace::OpenCLStub::Get().clRetainEvent;
+  auto func = mace::OpenCLLibraryImpl::Get().clRetainEvent;
   if (func != nullptr) {
     return func(event);
   } else {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
namespace mace {
class OpenCLLibrary {
public:
static bool Supported();
static void Load();
static void Unload();
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
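The free function OpenCLSupported() from the old wrapper is replaced by OpenCLLibrary::Supported(). Below is a minimal sketch of one way to guard OpenCL usage with the new entry points; it is assumed typical usage, not code from this commit.

// Illustrative sketch only.
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"

bool InitOpenCL() {
  if (!mace::OpenCLLibrary::Supported()) {
    LOG(ERROR) << "No loadable OpenCL library on this device";
    return false;
  }
  // OpenCLRuntime::Get() lazily creates a context and command queue on the
  // first GPU device it finds, and returns nullptr if that fails.
  return mace::OpenCLRuntime::Get() != nullptr;
}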
@@ -48,20 +48,30 @@ namespace mace {
 class Tensor {
  public:
   Tensor()
-      : alloc_(cpu_allocator()), size_(0), dtype_(DT_FLOAT), data_(nullptr){};
-
-  Tensor(Allocator* a, DataType type)
-      : alloc_(a), size_(0), dtype_(type), data_(nullptr){};
+      : alloc_(GetDeviceAllocator(DeviceType::CPU)),
+        size_(0),
+        dtype_(DT_FLOAT),
+        buffer_(nullptr),
+        data_(nullptr){};
+
+  Tensor(Allocator *alloc, DataType type)
+      : alloc_(alloc),
+        size_(0),
+        dtype_(type),
+        buffer_(nullptr),
+        data_(nullptr){};

   ~Tensor() {
-    if (alloc_ && data_.get()) {
-      data_.reset();
+    MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before destroy");
+    if (buffer_ != nullptr) {
+      MACE_CHECK_NOTNULL(alloc_);
+      alloc_->Delete(buffer_);
     }
   }

   inline DataType dtype() const { return dtype_; }

-  inline const vector<index_t>& shape() const { return shape_; }
+  inline const vector<index_t> &shape() const { return shape_; }

   inline index_t dim_size() const { return shape_.size(); }
@@ -72,69 +82,93 @@ class Tensor {
   inline index_t size() const { return size_; }

-  inline const void* raw_data() const {
-    MACE_CHECK(data_.get() || size_ == 0);
-    return data_.get();
-  }
-
-  template <typename T>
-  inline const T* data() const {
-    MACE_CHECK(
-        data_.get() || size_ == 0,
-        "The tensor is of non-zero shape, but its data is not allocated yet. ");
-    return static_cast<T*>(data_.get());
+  inline const bool OnHost() const { return alloc_->OnHost(); }
+
+  /*
+   * Map the device buffer as CPU buffer to access the data, unmap must be
+   * called later
+   */
+  inline void Map() {
+    if (!OnHost()) {
+      MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
+      data_ = alloc_->Map(buffer_, size_ * SizeOfType());
+    }
   }

-  inline void* raw_mutable_data() {
-    if (data_.get() || size_ == 0) {
-      return data_.get();
-    } else {
-      CASES(dtype_, data_.reset(alloc_->New(size_ * sizeof(T)),
-                                [this](void* ptr) { alloc_->Delete(ptr); }));
-      return data_.get();
+  /*
+   * Unmap the device buffer
+   */
+  inline void Unmap() {
+    if (!OnHost()) {
+      MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
+      alloc_->Unmap(buffer_, data_);
+      data_ = nullptr;
     }
   }

+  void *buffer() const { return buffer_; }
+
+  inline const void *raw_data() const {
+    void *data = MappedBuffer();
+    MACE_CHECK(data != nullptr || size_ == 0,
+               "The tensor is of non-zero shape, but its data is not allocated "
+               "or mapped yet.");
+    return data;
+  }
+
   template <typename T>
-  inline T* mutable_data() {
-    if (size_ == 0 || data_.get()) {
-      return static_cast<T*>(data_.get());
-    }
-    return static_cast<T*>(raw_mutable_data());
+  inline const T *data() const {
+    return static_cast<const T *>(raw_data());
   }

-  inline void Resize(const vector<index_t>& shape) {
+  inline void *raw_mutable_data() {
+    void *data = MappedBuffer();
+    MACE_CHECK(data != nullptr || size_ == 0,
+               "The tensor is of non-zero shape, but its data is not allocated "
+               "or mapped yet.");
+    return data;
+  }
+
+  template <typename T>
+  inline T *mutable_data() {
+    return static_cast<T *>(raw_mutable_data());
+  }
+
+  inline void Resize(const vector<index_t> &shape) {
     shape_ = shape;
     index_t size = NumElements();
     if (size_ != size) {
       size_ = size;
-      data_.reset();
+      MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
+      alloc_->Delete(buffer_);
+      CASES(dtype_, buffer_ = alloc_->New(size_ * sizeof(T)));
     }
   }

-  inline void ResizeLike(const Tensor& other) { Resize(other.shape()); }
-  inline void ResizeLike(const Tensor* other) { Resize(other->shape()); }
+  inline void ResizeLike(const Tensor &other) { Resize(other.shape()); }
+  inline void ResizeLike(const Tensor *other) { Resize(other->shape()); }

   template <typename T>
-  inline void Copy(const T* src, index_t size) {
+  inline void Copy(const T *src, index_t size) {
     MACE_CHECK(size == size_, "copy src and dst with different size.");
-    CopyBytes(static_cast<const void*>(src), sizeof(T) * size);
+    CopyBytes(static_cast<const void *>(src), sizeof(T) * size);
   }

   template <typename SrcType, typename DstType>
-  inline void CopyWithCast(const SrcType* src, size_t size) {
+  inline void CopyWithCast(const SrcType *src, size_t size) {
     MACE_CHECK(static_cast<index_t>(size) == size_,
                "copy src and dst with different size.");
     unique_ptr<DstType[]> buffer(new DstType[size]);
     for (size_t i = 0; i < size; ++i) {
       buffer[i] = static_cast<DstType>(src[i]);
     }
-    CopyBytes(static_cast<const void*>(buffer.get()), sizeof(DstType) * size);
+    CopyBytes(static_cast<const void *>(buffer.get()), sizeof(DstType) * size);
   }

-  inline void CopyBytes(const void* src, size_t size) {
-    alloc_->CopyBytes(raw_mutable_data(), src, size);
+  inline void CopyBytes(const void *src, size_t size) {
+    MappingGuard map_this(this);
+    memcpy(raw_mutable_data(), src, size);
   }

   inline void DebugPrint() const {
@@ -159,24 +193,47 @@ class Tensor {
     return type_size;
   }

-  inline void Copy(const Tensor& other) {
+  inline void Copy(Tensor &other) {
     alloc_ = other.alloc_;
     dtype_ = other.dtype_;
     ResizeLike(other);
-    const void* other_data = other.raw_data();
-    memcpy(raw_mutable_data(), other_data, size_ * SizeOfType());
+    MappingGuard map_other(&other);
+    CopyBytes(other.raw_data(), size_ * SizeOfType());
   }

+  class MappingGuard {
+   public:
+    MappingGuard(Tensor *tensor) : tensor_(tensor) {
+      MACE_ASSERT(tensor_ != nullptr);
+      tensor_->Map();
+    }
+    ~MappingGuard() { tensor_->Unmap(); }
+
+   private:
+    Tensor *tensor_;
+  };
+
  private:
   inline int64_t NumElements() const {
     return std::accumulate(shape_.begin(), shape_.end(), 1,
                            std::multiplies<int64_t>());
   }

-  Allocator* alloc_;
+  inline void *MappedBuffer() const {
+    if (OnHost()) {
+      return buffer_;
+    }
+    return data_;
+  }
+
+  Allocator *alloc_;
   index_t size_;
   DataType dtype_;
-  std::shared_ptr<void> data_;
+  // Raw buffer, must be mapped as host accessable data before
+  // read or write
+  void *buffer_;
+  // Mapped buffer
+  void *data_;
   vector<index_t> shape_;

   DISABLE_COPY_AND_ASSIGN(Tensor);
......
void kernel simple_add(global const int *a,
global const int *b,
global int *c,
global const int *step) {
int id = get_global_id(0);
int start = step[0] * id;
int stop = start + step[0];
for (int i = start; i < stop; i++) c[i] = a[i] + b[i];
}