提交 f23eb74b 编写于 作者: L liuqi

Refactor configuration APIs and Remove some global static variables.

上级 379c730d
......@@ -22,7 +22,6 @@
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
#include "mace/benchmark/statistics.h"
......@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {
mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime
MaceStatus ret = mace::SetOpenMPThreadPolicy(
// configuration
MaceStatus mace_status;
MaceEngineConfig config(device_type);
mace_status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
true);
if (ret != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
if (mace_status != MACE_SUCCESS) {
LOG(INFO) << "Set openmp or cpu affinity failed.";
}
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
std::string(storage_path_ptr == nullptr ?
"/data/local/tmp/mace_run/interior" : storage_path_ptr);
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
#endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/interior" : kernel_path);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
// Create Engine
std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status;
......@@ -306,7 +309,7 @@ int Main(int argc, char **argv) {
model_data_file_ptr,
input_names,
output_names,
device_type,
config,
&engine);
#else
create_engine_status =
......@@ -314,7 +317,7 @@ int Main(int argc, char **argv) {
model_data_file_ptr,
input_names,
output_names,
device_type,
config,
&engine);
#endif
if (create_engine_status != MaceStatus::MACE_SUCCESS) {
......
......@@ -13,30 +13,12 @@
// limitations under the License.
#include "mace/core/allocator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_allocator.h"
#endif
namespace mace {
std::map<int32_t, Allocator *> *gAllocatorRegistry() {
static std::map<int32_t, Allocator *> g_allocator_registry;
return &g_allocator_registry;
Allocator *GetCPUAllocator() {
static CPUAllocator allocator;
return &allocator;
}
Allocator *GetDeviceAllocator(DeviceType type) {
auto iter = gAllocatorRegistry()->find(type);
if (iter == gAllocatorRegistry()->end()) {
LOG(ERROR) << "Allocator not found for device " << type;
return nullptr;
}
return iter->second;
}
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
#endif
MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
} // namespace mace
......@@ -26,8 +26,6 @@
#include "mace/core/registry.h"
#include "mace/core/types.h"
#include "mace/core/runtime_failure_mock.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace {
......@@ -138,26 +136,8 @@ class CPUAllocator : public Allocator {
bool OnHost() const override { return true; }
};
std::map<int32_t, Allocator *> *gAllocatorRegistry();
Allocator *GetDeviceAllocator(DeviceType type);
struct AllocatorRegisterer {
explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) {
if (gAllocatorRegistry()->count(type)) {
LOG(ERROR) << "Allocator for device type " << type
<< " registered twice. This should not happen."
<< gAllocatorRegistry()->count(type);
std::exit(1);
}
gAllocatorRegistry()->emplace(type, alloc);
}
};
#define MACE_REGISTER_ALLOCATOR(type, alloc) \
namespace { \
static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \
}
// Global CPU allocator used for CPU/GPU/DSP
Allocator *GetCPUAllocator();
} // namespace mace
......
......@@ -20,7 +20,6 @@
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace {
......
......@@ -218,9 +218,9 @@ class Buffer : public BufferBase {
class Image : public BufferBase {
public:
Image()
explicit Image(Allocator *allocator)
: BufferBase(0),
allocator_(GetDeviceAllocator(GPU)),
allocator_(allocator),
buf_(nullptr),
mapped_buf_(nullptr) {}
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/device.h"
namespace mace {
CPUDevice::CPUDevice(const int num_threads)
: cpu_runtime_(new CPURuntime(num_threads)) {}
CPUDevice::~CPUDevice() = default;
CPURuntime *CPUDevice::cpu_runtime() {
return cpu_runtime_.get();
}
#ifdef MACE_ENABLE_OPENCL
OpenCLRuntime *CPUDevice::opencl_runtime() {
return nullptr;
}
#endif
Allocator *CPUDevice::allocator() {
return GetCPUAllocator();
}
DeviceType CPUDevice::device_type() const {
return DeviceType::CPU;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_DEVICE_H_
#define MACE_CORE_DEVICE_H_
#include <memory>
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/allocator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
namespace mace {
class Device {
public:
virtual ~Device() {}
#ifdef MACE_ENABLE_OPENCL
virtual OpenCLRuntime *opencl_runtime() = 0;
#endif
virtual CPURuntime *cpu_runtime() = 0;
virtual Allocator *allocator() = 0;
virtual DeviceType device_type() const = 0;
};
class CPUDevice : public Device {
public:
explicit CPUDevice(const int num_threads);
virtual ~CPUDevice();
#ifdef MACE_ENABLE_OPENCL
OpenCLRuntime *opencl_runtime() override;
#endif
CPURuntime *cpu_runtime() override;
Allocator *allocator() override;
DeviceType device_type() const override;
private:
std::unique_ptr<CPURuntime> cpu_runtime_;
};
} // namespace mace
#endif // MACE_CORE_DEVICE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/device_context.h"
#include <sys/stat.h>
namespace mace {
namespace {
const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
std::string FindFirstExistPath(const std::vector<std::string> &paths) {
std::string result;
struct stat st;
for (auto path : paths) {
if (stat(path.c_str(), &st) == 0) {
if (S_ISREG(st.st_mode)) {
result = path;
break;
}
}
}
return result;
}
} // namespace
GPUContext::GPUContext(const std::string &storage_path,
const std::vector<std::string> &opencl_binary_paths,
const std::string &opencl_parameter_path)
: storage_factory_(new FileStorageFactory(storage_path)),
opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
if (!storage_path.empty()) {
opencl_cache_storage_ =
storage_factory_->CreateStorage(kPrecompiledProgramFileName);
}
std::string precompiled_binary_path =
FindFirstExistPath(opencl_binary_paths);
if (!precompiled_binary_path.empty()) {
opencl_binary_storage_.reset(
new FileStorage(precompiled_binary_path));
}
}
GPUContext::~GPUContext() = default;
KVStorage *GPUContext::opencl_binary_storage() {
return opencl_binary_storage_.get();
}
KVStorage *GPUContext::opencl_cache_storage() {
return opencl_cache_storage_.get();
}
Tuner<uint32_t> *GPUContext::opencl_tuner() {
return opencl_tuner_.get();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_DEVICE_CONTEXT_H_
#define MACE_CORE_DEVICE_CONTEXT_H_
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "mace/core/file_storage.h"
#include "mace/utils/tuner.h"
namespace mace {
class GPUContext {
public:
GPUContext(const std::string &storage_path = "",
const std::vector<std::string> &opencl_binary_path = {},
const std::string &opencl_parameter_path = "");
~GPUContext();
KVStorage *opencl_binary_storage();
KVStorage *opencl_cache_storage();
Tuner<uint32_t> *opencl_tuner();
private:
std::unique_ptr<KVStorageFactory> storage_factory_;
std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
std::unique_ptr<KVStorage> opencl_binary_storage_;
std::unique_ptr<KVStorage> opencl_cache_storage_;
};
} // namespace mace
#endif // MACE_CORE_DEVICE_CONTEXT_H_
......@@ -28,10 +28,36 @@
namespace mace {
std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
class FileStorageFactory::Impl {
public:
explicit Impl(const std::string &path);
std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
private:
std::string path_;
};
FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
const std::string &name) {
return std::move(std::unique_ptr<KVStorage>(
new FileStorage(path_ + "/" + name)));
}
FileStorageFactory::FileStorageFactory(const std::string &path):
impl_(new FileStorageFactory::Impl(path)) {}
FileStorageFactory::~FileStorageFactory() = default;
std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
const std::string &name) {
return impl_->CreateStorage(name);
}
FileStorage::FileStorage(const std::string &file_path):
data_changed_(false), file_path_(file_path) {}
loaded_(false), data_changed_(false), file_path_(file_path) {}
int FileStorage::Load() {
struct stat st;
......@@ -47,6 +73,9 @@ int FileStorage::Load() {
}
}
utils::WriteLock lock(&data_mutex_);
if (loaded_) {
return 0;
}
int fd = open(file_path_.c_str(), O_RDONLY);
if (fd < 0) {
if (errno == ENOENT) {
......@@ -118,13 +147,17 @@ int FileStorage::Load() {
<< " failed, error code: " << strerror(errno);
return -1;
}
loaded_ = true;
return 0;
}
void FileStorage::Clear() {
bool FileStorage::Clear() {
utils::WriteLock lock(&data_mutex_);
data_.clear();
data_changed_ = true;
if (!data_.empty()) {
data_.clear();
data_changed_ = true;
}
return true;
}
bool FileStorage::Insert(const std::string &key,
......
......@@ -16,27 +16,64 @@
#define MACE_CORE_FILE_STORAGE_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace_runtime.h"
#include "mace/public/mace.h"
#include "mace/utils/rwlock.h"
namespace mace {
class KVStorage {
public:
// return: 0 for success, -1 for error
virtual int Load() = 0;
virtual bool Clear() = 0;
// insert or update the key-value.
virtual bool Insert(const std::string &key,
const std::vector<unsigned char> &value) = 0;
virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
// return: 0 for success, -1 for error
virtual int Flush() = 0;
virtual ~KVStorage() {}
};
class KVStorageFactory {
public:
virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
virtual ~KVStorageFactory() {}
};
class FileStorageFactory : public KVStorageFactory {
public:
// You have to make sure your APP have read and write permission of the path.
explicit FileStorageFactory(const std::string &path);
~FileStorageFactory();
std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
class FileStorage : public KVStorage {
public:
explicit FileStorage(const std::string &file_path);
public:
int Load() override;
void Clear() override;
bool Clear() override;
bool Insert(const std::string &key,
const std::vector<unsigned char> &value) override;
const std::vector<unsigned char> *Find(const std::string &key) override;
int Flush() override;
private:
bool loaded_;
bool data_changed_;
std::string file_path_;
std::map<std::string, std::vector<unsigned char>> data_;
......
......@@ -18,6 +18,7 @@
#include "mace/core/macros.h"
#include "mace/core/net.h"
#include "mace/public/mace.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
......@@ -27,30 +28,35 @@ namespace mace {
NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type)
Device *device)
: name_(net_def->name()), op_registry_(op_registry) {
MACE_UNUSED(ws);
MACE_UNUSED(type);
MACE_UNUSED(device);
}
SerialNet::SerialNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type,
Device *device,
const NetMode mode)
: NetBase(op_registry, net_def, ws, type), device_type_(type) {
: NetBase(op_registry, net_def, ws, device), device_(device),
op_kernel_context_(new OpKernelContext(ws, device)) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
DeviceType device_type = device->device_type();
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(device_type_));
if (op_device == type) {
operator_def, "device", static_cast<int>(device_type));
if (op_device == device_type) {
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << ")";
OperatorDef temp_def(operator_def);
std::unique_ptr<OperatorBase> op(
op_registry->CreateOperator(temp_def, ws, type, mode));
op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
device_type, mode));
if (op) {
operators_.emplace_back(std::move(op));
}
......@@ -61,13 +67,14 @@ SerialNet::SerialNet(
MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net");
const DeviceType device_type = device_->device_type();
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), "), mem_id: ",
MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size()));
bool future_wait = (device_type_ == DeviceType::GPU &&
bool future_wait = (device_type == DeviceType::GPU &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
......@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
} else {
future.wait_fn(nullptr);
}
#ifdef MACE_ENABLE_OPENCL
device_->opencl_runtime()->command_queue().finish();
#endif
} else if (run_metadata != nullptr) {
call_stats.start_micros = NowMicros();
MACE_RETURN_IF_ERROR(op->Run(nullptr));
......@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
VLOG(3) << "Operator " << op->debug_def().name()
<< " has shape: " << MakeString(op->Output(0)->shape());
if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) {
if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
for (int i = 0; i < op->OutputSize(); ++i) {
int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT));
if (data_type == static_cast<int>(DT_FLOAT)) {
......@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry,
const NetDef &net_def,
Workspace *ws,
DeviceType type,
Device *device,
const NetMode mode) {
std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
return CreateNet(op_registry, tmp_net_def, ws, type, mode);
return CreateNet(op_registry, tmp_net_def, ws, device, mode);
}
std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type,
Device *device,
const NetMode mode) {
std::unique_ptr<NetBase> net(
new SerialNet(op_registry, net_def, ws, type, mode));
new SerialNet(op_registry, net_def, ws, device, mode));
return net;
}
......
......@@ -20,7 +20,6 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -33,7 +32,7 @@ class NetBase {
NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type);
Device *device);
virtual ~NetBase() noexcept {}
virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
......@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type,
Device *device,
const NetMode mode = NetMode::NORMAL);
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
protected:
std::vector<std::unique_ptr<OperatorBase> > operators_;
DeviceType device_type_;
Device *device_;
std::unique_ptr<OpKernelContext> op_kernel_context_;
MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
};
......@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry,
const NetDef &net_def,
Workspace *ws,
DeviceType type,
Device *device,
const NetMode mode = NetMode::NORMAL);
std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type,
Device *device,
const NetMode mode = NetMode::NORMAL);
} // namespace mace
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_kernel_context.h"
namespace mace {
OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
: device_(device), ws_(ws) {}
OpKernelContext::~OpKernelContext() = default;
Device* OpKernelContext::device() {
return device_;
}
Workspace* OpKernelContext::workspace() {
return ws_;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
#define MACE_CORE_OP_KERNEL_CONTEXT_H_
#include "mace/core/device.h"
#include "mace/core/workspace.h"
namespace mace {
class OpKernelContext {
public:
OpKernelContext(Workspace *ws, Device *device);
~OpKernelContext();
Device *device();
Workspace *workspace();
private:
Device *device_;
Workspace *ws_;
};
} // namespace mace
#endif // MACE_CORE_OP_KERNEL_CONTEXT_H_
......@@ -18,12 +18,15 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/core/op_kernel_context.h"
namespace mace {
OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
: operator_ws_(ws),
operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
OperatorBase::OperatorBase(const OperatorDef &operator_def,
OpKernelContext *context)
: operator_def_(std::make_shared<OperatorDef>(operator_def)) {
MACE_UNUSED(context);
}
OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
......@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}
std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
const OperatorDef &operator_def,
Workspace *ws,
OpKernelContext *context,
DeviceType type,
const NetMode mode) const {
const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
......@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
.Device(type)
.TypeConstraint("T", static_cast<DataType>(dtype))
.Build(),
operator_def, ws);
operator_def, context);
} else {
return nullptr;
}
......
......@@ -22,17 +22,17 @@
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace {
class OperatorBase {
public:
explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws);
explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *);
virtual ~OperatorBase() noexcept {}
template <typename T>
......@@ -78,7 +78,6 @@ class OperatorBase {
inline bool has_debug_def() const { return operator_def_ != nullptr; }
protected:
Workspace *operator_ws_;
std::shared_ptr<const OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_;
......@@ -89,8 +88,9 @@ class OperatorBase {
template <DeviceType D, class T>
class Operator : public OperatorBase {
public:
explicit Operator(const OperatorDef &operator_def, Workspace *ws)
: OperatorBase(operator_def, ws) {
explicit Operator(const OperatorDef &operator_def, OpKernelContext *context)
: OperatorBase(operator_def, context) {
Workspace *ws = context->workspace();
for (const std::string &input_str : operator_def.input()) {
const Tensor *tensor = ws->GetTensor(input_str);
MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
......@@ -116,7 +116,7 @@ class Operator : public OperatorBase {
output_type = DataTypeToEnum<T>::v();
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, GetDeviceAllocator(D), output_type)));
output_str, context->device()->allocator(), output_type)));
}
}
}
......@@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
class OperatorRegistryBase {
public:
typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
typedef Registry<std::string,
OperatorBase,
const OperatorDef &,
OpKernelContext *>
RegistryType;
OperatorRegistryBase() = default;
virtual ~OperatorRegistryBase();
RegistryType *registry() { return &registry_; }
std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
Workspace *ws,
OpKernelContext *context,
DeviceType type,
const NetMode mode) const;
......@@ -183,7 +186,7 @@ class OperatorRegistryBase {
MACE_DECLARE_REGISTRY(OpRegistry,
OperatorBase,
const OperatorDef &,
Workspace *);
OpKernelContext *);
#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
......
......@@ -22,7 +22,6 @@
#include <string>
#include <vector>
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
namespace mace {
......
......@@ -30,7 +30,6 @@
#include "public/gemmlowp.h"
#include "mace/core/macros.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h"
namespace mace {
......
......@@ -18,7 +18,6 @@
#include <vector>
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace {
......@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp = false);
class CPURuntime {
public:
explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
~CPURuntime() = default;
inline int num_threads() const {
return num_threads_;
}
private:
int num_threads_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/gpu_device.h"
namespace mace {
GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
KVStorage *opencl_cache_storage,
const GPUPriorityHint priority,
const GPUPerfHint perf,
KVStorage *opencl_binary_storage,
const int num_threads) :
CPUDevice(num_threads),
runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
opencl_binary_storage, tuner)),
allocator_(new OpenCLAllocator(runtime_.get())) {}
GPUDevice::~GPUDevice() = default;
OpenCLRuntime* GPUDevice::opencl_runtime() {
return runtime_.get();
}
Allocator* GPUDevice::allocator() {
return allocator_.get();
}
DeviceType GPUDevice::device_type() const {
return DeviceType::GPU;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
#include <memory>
#include "mace/core/device_context.h"
#include "mace/core/device.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
namespace mace {
class GPUDevice : public CPUDevice {
public:
GPUDevice(Tuner<uint32_t> *tuner,
KVStorage *opencl_cache_storage = nullptr,
const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
KVStorage *opencl_binary_storage = nullptr,
const int num_threads = -1);
~GPUDevice();
OpenCLRuntime *opencl_runtime() override;
Allocator *allocator() override;
DeviceType device_type() const override;
private:
std::unique_ptr<OpenCLRuntime> runtime_;
std::unique_ptr<OpenCLAllocator> allocator_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
......@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
......@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
}
} // namespace
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::OpenCLAllocator(
OpenCLRuntime *opencl_runtime):
opencl_runtime_(opencl_runtime) {}
OpenCLAllocator::~OpenCLAllocator() {}
MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
......@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
}
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error);
if (error != CL_SUCCESS) {
......@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(),
new cl::Image2D(opencl_runtime_->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
image_shape[0], image_shape[1], 0, nullptr, &error);
if (error != CL_SUCCESS) {
......@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
}
void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
VLOG(3) << "Map OpenCL buffer";
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue();
auto queue = opencl_runtime_->command_queue();
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
......@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch) const {
MACE_CHECK(image_shape.size() == 2, "Just support map 2d image");
VLOG(3) << "Map OpenCL Image";
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch->resize(2);
cl_int error;
void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
nullptr, &error);
......@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer,
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
VLOG(3) << "Unmap OpenCL buffer/Image";
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue();
auto queue = opencl_runtime_->command_queue();
cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
nullptr, nullptr);
if (error != CL_SUCCESS) {
......
......@@ -15,15 +15,17 @@
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#include <memory>
#include <vector>
#include "mace/core/allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
class OpenCLAllocator : public Allocator {
public:
OpenCLAllocator();
explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime);
~OpenCLAllocator() override;
......@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator {
void Unmap(void *buffer, void *mapped_ptr) const override;
bool OnHost() const override;
private:
OpenCLRuntime *opencl_runtime_;
};
} // namespace mace
......
......@@ -24,11 +24,9 @@
#include <vector>
#include <utility>
#include "mace/public/mace_runtime.h"
#include "mace/core/macros.h"
#include "mace/core/file_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/public/mace.h"
#include "mace/utils/tuner.h"
namespace mace {
......@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) {
const char *kOpenCLPlatformInfoKey =
"mace_opencl_precompiled_platform_info_key";
const char *kPrecompiledProgramFileName =
"mace_cl_compiled_program.bin";
} // namespace
void OpenCLProfilingTimer::StartTiming() {}
void OpenCLProfilingTimer::StopTiming() {
OpenCLRuntime::Global()->command_queue().finish();
runtime_->command_queue().finish();
start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
}
......@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() {
accumulated_micros_ = 0;
}
GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
GPUPriorityHint::PRIORITY_DEFAULT;
std::string
OpenCLRuntime::kPrecompiledBinaryPath = ""; // NOLINT(runtime/string)
OpenCLRuntime *OpenCLRuntime::Global() {
static OpenCLRuntime runtime;
return &runtime;
}
void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
}
void OpenCLRuntime::ConfigureOpenCLBinaryPath(
const std::vector<std::string> &paths) {
OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
LOG(WARNING) << "There is no precompiled OpenCL binary file in "
<< MakeString(paths);
}
}
OpenCLRuntime::OpenCLRuntime():
precompiled_binary_storage_(nullptr),
cache_storage_(nullptr),
OpenCLRuntime::OpenCLRuntime(
KVStorage *cache_storage,
const GPUPriorityHint priority_hint,
const GPUPerfHint perf_hint,
KVStorage *precompiled_binary_storage,
Tuner<uint32_t> *tuner):
cache_storage_(cache_storage),
precompiled_binary_storage_(precompiled_binary_storage),
tuner_(tuner),
is_opencl_avaliable_(false),
is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN),
......@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
cl_command_queue_properties properties = 0;
const char *profiling = getenv("MACE_OPENCL_PROFILING");
if (Tuner<uint32_t>::Get()->IsTuning() ||
if (IsTuning() ||
(profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
properties |= CL_QUEUE_PROFILING_ENABLE;
is_profiling_enabled_ = true;
......@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
std::vector<cl_context_properties> context_properties;
context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties,
OpenCLRuntime::kGPUPerfHint,
OpenCLRuntime::kGPUPriorityHint);
perf_hint,
priority_hint);
context_ = std::shared_ptr<cl::Context>(
new cl::Context({*device_}, context_properties.data(),
nullptr, nullptr, &err));
......@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
return;
}
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
std::string cached_binary_platform_info;
if (kStorageFactory != nullptr) {
cache_storage_ =
kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
if (cache_storage_ != nullptr) {
if (cache_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
<< "Please make sure the storage directory exist "
......@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
}
if (cached_binary_platform_info != platform_info_) {
if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
precompiled_binary_storage_.reset(
new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
if (precompiled_binary_storage_ == nullptr) {
VLOG(1) << "There is no precompiled OpenCL binary in"
" all OpenCL binary paths.";
} else {
if (precompiled_binary_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
<< "Please make sure the storage directory exist "
......@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
Tuner<uint32_t> *OpenCLRuntime::tuner() { return tuner_; }
uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
return device_gloabl_mem_cache_size_;
}
......
......@@ -22,11 +22,12 @@
#include <string>
#include <vector>
#include "mace/core/file_storage.h"
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/string_util.h"
#include "mace/utils/timer.h"
#include "mace/utils/tuner.h"
namespace mace {
......@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error);
return MaceStatus::MACE_OUT_OF_RESOURCES; \
}
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0) {}
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
class OpenCLRuntime {
public:
static OpenCLRuntime *Global();
static void Configure(GPUPerfHint, GPUPriorityHint);
static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths);
OpenCLRuntime(
KVStorage *cache_storage = nullptr,
const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL,
const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL,
KVStorage *precompiled_binary_storage = nullptr,
Tuner<uint32_t> *tuner = nullptr);
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
cl::Context &context();
cl::Device &device();
......@@ -91,6 +80,7 @@ class OpenCLRuntime {
const std::string platform_info() const;
uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
bool is_opencl_avaliable();
void GetCallStats(const cl::Event &event, CallStats *stats);
......@@ -112,11 +102,6 @@ class OpenCLRuntime {
void SaveBuiltCLProgram();
private:
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
bool BuildProgram(const std::string &program_file_name,
const std::string &binary_file_name,
const std::string &build_options,
......@@ -137,10 +122,13 @@ class OpenCLRuntime {
OpenCLVersion ParseDeviceVersion(const std::string &device_version);
private:
std::unique_ptr<KVStorage> precompiled_binary_storage_;
std::unique_ptr<KVStorage> cache_storage_;
KVStorage *cache_storage_;
KVStorage *precompiled_binary_storage_;
Tuner<uint32_t> *tuner_;
bool is_opencl_avaliable_;
bool is_profiling_enabled_;
OpenCLVersion opencl_version_;
GPUType gpu_type_;
// All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library.
std::shared_ptr<cl::Context> context_;
......@@ -149,18 +137,30 @@ class OpenCLRuntime {
std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_;
std::string platform_info_;
OpenCLVersion opencl_version_;
std::string precompiled_binary_platform_info_;
bool out_of_range_check_;
uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_;
GPUType gpu_type_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
static std::string kPrecompiledBinaryPath;
};
class OpenCLProfilingTimer : public Timer {
public:
OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
: runtime_(runtime), event_(event), accumulated_micros_(0) {}
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
OpenCLRuntime *runtime_;
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
......@@ -25,7 +25,6 @@
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_NEON
......@@ -38,10 +37,10 @@
namespace mace {
#define MACE_SINGLE_ARG(...) __VA_ARGS__
#define MACE_CASE(TYPE, STATEMENTS) \
#define MACE_CASE(TYPE, STATEMENTS) \
case DataTypeToEnum<TYPE>::value: { \
typedef TYPE T; \
STATEMENTS; \
STATEMENTS; \
break; \
}
......@@ -137,7 +136,7 @@ class Tensor {
buffer_ = &buffer_slice_;
}
Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) {
......@@ -270,7 +269,7 @@ class Tensor {
image_shape_ = image_shape;
if (buffer_ == nullptr) {
MACE_CHECK(is_buffer_owner_);
buffer_ = new Image();
buffer_ = new Image(allocator_);
return buffer_->Allocate(image_shape, dtype_);
} else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
......
......@@ -16,15 +16,10 @@
#include "gflags/gflags.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h"
DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
......@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
}
mace::OpenCLRuntime::Configure(
static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
mace::testing::Benchmark::Run(FLAGS_filter.c_str());
return 0;
}
......@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/workspace.h"
#include <memory>
#include <string>
#include <vector>
#include <unordered_set>
......@@ -21,8 +24,6 @@
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
namespace mace {
......@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
}
} // namespace
Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer(
GetDeviceAllocator(DeviceType::CPU))) {}
Workspace::Workspace() :
host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
......@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const {
}
MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
DeviceType type,
Device *device,
const unsigned char *model_data) {
MACE_LATENCY_LOGGER(1, "Load model tensors");
index_t model_data_size = 0;
......@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
}
VLOG(3) << "Model data size: " << model_data_size;
const DeviceType device_type = device->device_type();
if (model_data_size > 0) {
#ifdef MACE_ENABLE_OPENCL
if (type == DeviceType::GPU &&
OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <=
if (device_type == DeviceType::GPU &&
device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
static_cast<uint64_t>(model_data_size)) {
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
......@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
}
std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(type),
new Tensor(device->allocator(),
const_tensor.data_type(), true));
tensor->Resize(dims);
......@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
#else
{
#endif
if (type == DeviceType::CPU) {
if (device_type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type),
new Buffer(device->allocator(),
const_cast<unsigned char*>(model_data),
model_data_size));
} else {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type)));
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
......@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
}
}
if (type == DeviceType::CPU || type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, type);
if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status;
}
if (type == DeviceType::CPU && net_def.has_quantize_info()) {
if (device_type == DeviceType::CPU && net_def.has_quantize_info()) {
for (const auto
&activation_info: net_def.quantize_info().activation_info()) {
if (HasTensor(activation_info.tensor_name())) {
......@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
}
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type) {
Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id,
......@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
<< ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::CPU)));
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
......@@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image());
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::GPU)));
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
......@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
op, "T", static_cast<int>(DT_FLOAT)));
}
CreateTensor(op.output(i),
GetDeviceAllocator(device_type),
device->allocator(),
output_type);
}
}
......@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() {
}
void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data) {
const unsigned char *model_data,
Allocator *alloc) {
for (auto &const_tensor : net_def.tensors()) {
auto iter = tensor_map_.find(const_tensor.name());
if (iter->second->unused()) {
......@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
dims.push_back(d);
}
std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(DeviceType::GPU),
const_tensor.data_type()));
new Tensor(alloc, const_tensor.data_type()));
tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape");
......
......@@ -20,6 +20,7 @@
#include <vector>
#include <memory>
#include "mace/core/device.h"
#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
......@@ -48,7 +49,7 @@ class Workspace {
std::vector<std::string> Tensors() const;
MaceStatus LoadModelTensor(const NetDef &net_def,
DeviceType type,
Device *device,
const unsigned char *model_data);
ScratchBuffer *GetScratchBuffer(DeviceType device_type);
......@@ -56,11 +57,14 @@ class Workspace {
void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data);
const unsigned char *model_data,
Allocator *alloc);
private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type);
Device *device);
Device *device_;
TensorMap tensor_map_;
......
......@@ -37,15 +37,13 @@ public class AppModel {
mJniThread = new Handler(thread.getLooper());
}
public void maceMobilenetSetAttrs(final InitData initData) {
public void maceMobilenetCreateGPUContext(final InitData initData) {
mJniThread.post(new Runnable() {
@Override
public void run() {
int result = JniMaceUtils.maceMobilenetSetAttrs(
initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
initData.getKernelPath());
Log.i("APPModel", "maceMobilenetSetAttrs result = " + result);
int result = JniMaceUtils.maceMobilenetCreateGPUContext(
initData.getStoragePath());
Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result);
}
});
}
......@@ -54,7 +52,10 @@ public class AppModel {
mJniThread.post(new Runnable() {
@Override
public void run() {
int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice());
int result = JniMaceUtils.maceMobilenetCreateEngine(
initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
initData.getModel(), initData.getDevice());
Log.i("APPModel", "maceMobilenetCreateEngine result = " + result);
if (result == -1) {
......
......@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap
}
private void initJni() {
AppModel.instance.maceMobilenetSetAttrs(initData);
AppModel.instance.maceMobilenetCreateGPUContext(initData);
AppModel.instance.maceMobilenetCreateEngine(initData, this);
}
......
......@@ -29,7 +29,7 @@ public class InitData {
private int cpuAffinityPolicy;
private int gpuPerfHint;
private int gpuPriorityHint;
private String kernelPath = "";
private String storagePath = "";
public InitData() {
model = MODELS[0];
......@@ -38,8 +38,8 @@ public class InitData {
gpuPerfHint = 3;
gpuPriorityHint = 3;
device = DEVICES[0];
kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
File file = new File(kernelPath);
storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
File file = new File(storagePath);
if (!file.exists()) {
file.mkdir();
}
......@@ -94,11 +94,11 @@ public class InitData {
this.gpuPriorityHint = gpuPriorityHint;
}
public String getKernelPath() {
return kernelPath;
public String getStoragePath() {
return storagePath;
}
public void setKernelPath(String kernelPath) {
this.kernelPath = kernelPath;
public void setStoragePath(String storagePath) {
this.storagePath = storagePath;
}
}
......@@ -26,7 +26,6 @@
#include <numeric>
#include "src/main/cpp/include/mace/public/mace.h"
#include "src/main/cpp/include/mace/public/mace_runtime.h"
#include "src/main/cpp/include/mace/public/mace_engine_factory.h"
namespace {
......@@ -39,8 +38,8 @@ struct ModelInfo {
};
struct MaceContext {
std::shared_ptr<mace::GPUContext> gpu_context;
std::shared_ptr<mace::MaceEngine> engine;
std::shared_ptr<mace::KVStorageFactory> storage_factory;
std::string model_name;
mace::DeviceType device_type = mace::DeviceType::CPU;
std::map<std::string, ModelInfo> model_infos = {
......@@ -72,48 +71,65 @@ MaceContext& GetMaceContext() {
} // namespace
JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs(
JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) {
JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(
JNIEnv *env, jclass thisObj, jstring storage_path) {
MaceContext &mace_context = GetMaceContext();
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr);
if (storage_path_ptr == nullptr) return JNI_ERR;
const std::string storage_file_path(storage_path_ptr);
env->ReleaseStringUTFChars(storage_path, storage_path_ptr);
mace::MaceStatus status;
// openmp
status = mace::SetOpenMPThreadPolicy(
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
// gpu
mace::SetGPUHints(
static_cast<mace::GPUPerfHint>(gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"gpu perf: %d, priority: %d",
gpu_perf_hint, gpu_priority_hint);
// opencl cache
const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr);
if (kernel_path_ptr == nullptr) return JNI_ERR;
const std::string kernel_file_path(kernel_path_ptr);
mace_context.storage_factory.reset(
new mace::FileStorageFactory(kernel_file_path));
mace::SetKVStorageFactory(mace_context.storage_factory);
env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr);
mace_context.gpu_context = mace::GPUContextBuilder()
.SetStoragePath(storage_file_path)
.Finalize();
return JNI_OK;
}
JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) {
JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
jint gpu_perf_hint, jint gpu_priority_hint,
jstring model_name_str, jstring device) {
MaceContext &mace_context = GetMaceContext();
// get device
const char *device_ptr = env->GetStringUTFChars(device, nullptr);
if (device_ptr == nullptr) return JNI_ERR;
mace_context.device_type = ParseDeviceType(device_ptr);
env->ReleaseStringUTFChars(device, device_ptr);
// create MaceEngineConfig
mace::MaceStatus status;
mace::MaceEngineConfig config(mace_context.device_type);
status = config.SetCPUThreadPolicy(
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
if (status != mace::MACE_SUCCESS) {
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
}
if (mace_context.device_type == mace::DeviceType::GPU) {
config.SetGPUContext(mace_context.gpu_context);
config.SetGPUHints(
static_cast<mace::GPUPerfHint>(gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"gpu perf: %d, priority: %d",
gpu_perf_hint, gpu_priority_hint);
}
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"device: %d",
mace_context.device_type);
// parse model name
const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr);
if (model_name_ptr == nullptr) return JNI_ERR;
......@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
std::vector<std::string> input_names = {model_info_iter->second.input_name};
std::vector<std::string> output_names = {model_info_iter->second.output_name};
// get device
const char *device_ptr = env->GetStringUTFChars(device, nullptr);
if (device_ptr == nullptr) return JNI_ERR;
mace_context.device_type = ParseDeviceType(device_ptr);
env->ReleaseStringUTFChars(device, device_ptr);
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"device: %d",
mace_context.device_type);
mace::MaceStatus create_engine_status =
CreateMaceEngineFromCode(mace_context.model_name,
std::string(),
input_names,
output_names,
mace_context.device_type,
config,
&mace_context.engine);
__android_log_print(ANDROID_LOG_ERROR,
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"create result: %d",
create_engine_status);
......
......@@ -24,11 +24,13 @@ extern "C" {
#endif
/*
* Class: com_xiaomi_mace_JniMaceUtils
* Method: maceMobilenetSetAttrs
* Method: maceMobilenetCreateGPUContext
* Signature: (Ljava/lang/String;IIIILjava/lang/String;)I
*/
JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
(JNIEnv *, jclass, jint, jint, jint, jint, jstring);
JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *,
jclass,
jstring);
/*
* Class: com_xiaomi_mace_JniMaceUtils
......@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
*/
JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine
(JNIEnv *, jclass, jstring, jstring);
(JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring);
/*
* Class: com_xiaomi_mace_JniMaceUtils
......
......@@ -20,9 +20,9 @@ public class JniMaceUtils {
System.loadLibrary("mace_mobile_jni");
}
public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath);
public static native int maceMobilenetCreateGPUContext(String storagePath);
public static native int maceMobilenetCreateEngine(String model, String device);
public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device);
public static native float[] maceMobilenetClassify(float[] input);
......
......@@ -21,7 +21,6 @@
#include "gflags/gflags.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
// if convert model to code.
#ifdef MODEL_GRAPH_FORMAT_CODE
#include "mace/codegen/engine/mace_engine_factory.h"
......@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &output_shapes) {
// load model
DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime
mace::SetOpenMPThreadPolicy(
// configuration
// Detailed information please see mace.h
MaceStatus status;
MaceEngineConfig config(device_type);
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (status != MACE_SUCCESS) {
std::cerr << "Set openmp or cpu affinity failed." << std::endl;
}
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
// Just call once. (Not thread-safe)
// Set paths of Generated OpenCL Compiled Kernel Binary file
// if you build gpu library of specific soc.
// Using OpenCL binary will speed up the initialization.
// OpenCL binary is corresponding to the OpenCL Driver version,
// you should update the binary when OpenCL Driver changed.
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
std::string(storage_path_ptr == nullptr ?
"/data/local/tmp/mace_run/interior" : storage_path_ptr);
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
#endif // MACE_ENABLE_OPENCL
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
// Just call once
const std::string internal_storage_path =
"/data/local/tmp/mace_run/interior";
// Config internal kv storage factory.
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(internal_storage_path));
SetKVStorageFactory(storage_factory);
// Create Engine
std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status;
......@@ -204,7 +203,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_model_data_file,
input_names,
output_names,
device_type,
config,
&engine);
#else
std::vector<unsigned char> model_pb_data;
......@@ -216,7 +215,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_model_data_file,
input_names,
output_names,
device_type,
config,
&engine);
#endif
......
......@@ -23,6 +23,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -126,10 +127,14 @@ template <DeviceType D, typename T>
class ActivationFunctor;
template <>
class ActivationFunctor<DeviceType::CPU, float> {
class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
public:
ActivationFunctor(ActivationType type, float relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
ActivationFunctor(OpKernelContext *context,
ActivationType type,
float relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *alpha,
......@@ -159,10 +164,14 @@ class ActivationFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationFunctor<DeviceType::GPU, T> {
class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
: activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {}
ActivationFunctor(OpKernelContext *context,
ActivationType type,
T relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(static_cast<T>(relux_max_limit)) {}
MaceStatus operator()(const Tensor *input,
const Tensor *alpha,
......
......@@ -24,6 +24,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -35,10 +36,11 @@ namespace kernels {
constexpr int kCostPerGroup = 1024;
template <DeviceType D, typename T>
struct AddNFunctor {
struct AddNFunctor : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
Tensor *output_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
index_t size = output_tensor->size();
......@@ -95,7 +97,8 @@ struct AddNFunctor {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct AddNFunctor<DeviceType::GPU, T> {
struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future);
......
......@@ -23,6 +23,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
......@@ -30,7 +31,8 @@ namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct ArgMaxFunctor {
struct ArgMaxFunctor : OpKernel {
explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input,
const Tensor *axis,
Tensor *output,
......
......@@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) {
index_t filter_size = 3 * 3 * in_channels * out_channels;
index_t output_size = batch * out_channels * out_height * out_width;
Tensor input;
Tensor filter;
Tensor output;
Tensor output_ref;
Tensor input(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT);
input.Resize({batch, in_channels, in_height, in_width});
filter.Resize({out_channels, in_channels, 3, 3});
......
......@@ -33,11 +33,13 @@
namespace mace {
namespace kernels {
struct BatchNormFunctorBase {
BatchNormFunctorBase(bool folded_constant,
struct BatchNormFunctorBase : OpKernel {
BatchNormFunctorBase(OpKernelContext *context,
bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: folded_constant_(folded_constant),
: OpKernel(context),
folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
......@@ -51,10 +53,14 @@ struct BatchNormFunctor;
template<>
struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *scale,
......@@ -132,10 +138,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -30,10 +31,10 @@
namespace mace {
namespace kernels {
struct BiasAddFunctorBase {
explicit BiasAddFunctorBase(const DataFormat data_format) {
data_format_ = data_format;
}
struct BiasAddFunctorBase : OpKernel {
BiasAddFunctorBase(OpKernelContext *context,
const DataFormat data_format)
: OpKernel(context), data_format_(data_format) {}
DataFormat data_format_;
};
......@@ -43,8 +44,9 @@ struct BiasAddFunctor;
template <>
struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
explicit BiasAddFunctor(const DataFormat data_format)
: BiasAddFunctorBase(data_format) {}
BiasAddFunctor(OpKernelContext *context,
const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {}
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
......@@ -96,8 +98,8 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
explicit BiasAddFunctor(const DataFormat data_format)
: BiasAddFunctorBase(data_format) {}
BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {}
MaceStatus operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
......
......@@ -20,21 +20,24 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h"
namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
explicit BufferToImageFunctorBase(const int wino_blk_size)
: wino_blk_size_(wino_blk_size) {}
struct BufferToImageFunctorBase : OpKernel {
explicit BufferToImageFunctorBase(OpKernelContext *context,
const int wino_blk_size)
: OpKernel(context), wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
explicit BufferToImageFunctor(OpKernelContext *context,
const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(const int wino_blk_size)
: BufferToImageFunctorBase(wino_blk_size) {}
explicit BufferToImageFunctor(OpKernelContext *context,
const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......
......@@ -20,13 +20,15 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct ChannelShuffleFunctor {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
struct ChannelShuffleFunctor : OpKernel {
ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -70,8 +72,9 @@ struct ChannelShuffleFunctor {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct ChannelShuffleFunctor<DeviceType::GPU, T> {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -30,15 +31,17 @@
namespace mace {
namespace kernels {
struct ConcatFunctorBase {
explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
struct ConcatFunctorBase : OpKernel {
ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_;
};
template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......@@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......
......@@ -42,14 +42,16 @@
namespace mace {
namespace kernels {
struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides,
struct Conv2dFunctorBase : OpKernel {
Conv2dFunctorBase(OpKernelContext *context,
const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: strides_(strides),
: OpKernel(context),
strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
dilations_(dilations),
......@@ -69,7 +71,8 @@ struct Conv2dFunctor;
template<>
struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const int *dilations,
......@@ -77,12 +80,14 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
: Conv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit),
transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {}
......@@ -721,7 +726,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
template<>
struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const int *dilations,
......@@ -729,7 +735,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
: Conv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
......@@ -949,7 +956,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const int *dilations,
......@@ -957,7 +965,8 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
: Conv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
......@@ -968,10 +977,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
}
MaceStatus operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -30,10 +31,12 @@
namespace mace {
namespace kernels {
struct CropFunctorBase {
CropFunctorBase(const int axis,
struct CropFunctorBase : OpKernel {
CropFunctorBase(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: axis_(axis),
: OpKernel(context),
axis_(axis),
offset_(offset) {}
const int axis_;
......@@ -42,8 +45,10 @@ struct CropFunctorBase {
template <DeviceType D, typename T>
struct CropFunctor : CropFunctorBase {
CropFunctor(const int axis, const std::vector<int> &offset)
: CropFunctorBase(axis, offset) {}
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape,
......@@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase {
CropFunctor(const int axis, const std::vector<int> &offset)
: CropFunctorBase(axis, offset) {}
CropFunctor(OpKernelContext *context,
const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future);
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
......
......@@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input,
}
} // namespace deconv
struct Deconv2dFunctorBase {
Deconv2dFunctorBase(const std::vector<int> &strides,
struct Deconv2dFunctorBase : OpKernel {
Deconv2dFunctorBase(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit)
: strides_(strides),
: OpKernel(context),
strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
output_shape_(output_shape),
......@@ -210,13 +212,15 @@ struct Deconv2dFunctorBase {
template <DeviceType D, typename T>
struct Deconv2dFunctor : Deconv2dFunctorBase {
Deconv2dFunctor(const std::vector<int> &strides,
Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit)
: Deconv2dFunctorBase(strides,
: Deconv2dFunctorBase(context,
strides,
padding_type,
paddings,
output_shape,
......@@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(const std::vector<int> &strides,
Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const std::vector<index_t> &output_shape,
const ActivationType activation,
const float relux_max_limit)
: Deconv2dFunctorBase(strides,
: Deconv2dFunctorBase(context,
strides,
padding_type,
paddings,
output_shape,
......
......@@ -19,6 +19,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -29,9 +30,11 @@ namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct DepthToSpaceOpFunctor {
explicit DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {}
struct DepthToSpaceOpFunctor : OpKernel {
DepthToSpaceOpFunctor(OpKernelContext *context,
const int block_size,
bool d2s)
: OpKernel(context), block_size_(block_size), d2s_(d2s) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
......@@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor {
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> {
DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {}
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
DepthToSpaceOpFunctor(OpKernelContext *context,
const int block_size,
bool d2s)
: OpKernel(context), block_size_(block_size), d2s_(d2s) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
StatsFuture *future);
......
......@@ -37,14 +37,16 @@
namespace mace {
namespace kernels {
struct DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctorBase(const int *strides,
struct DepthwiseConv2dFunctorBase : OpKernel {
DepthwiseConv2dFunctorBase(OpKernelContext *context,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: strides_(strides),
: OpKernel(context),
strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
dilations_(dilations),
......@@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor;
template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
: public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides,
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
......@@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
: public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides,
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
......@@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
const int32_t *bias_data = nullptr;
if (bias == nullptr) {
zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
new Tensor(GetCPUAllocator(), DT_INT32));
zero_bias->Resize(bias_shape);
zero_bias->Clear();
bias_data = zero_bias->data<int32_t>();
......@@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides,
: DepthwiseConv2dFunctorBase(context,
strides,
padding_type,
paddings,
dilations,
......
......@@ -23,6 +23,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
struct EltwiseFunctorBase {
EltwiseFunctorBase(const EltwiseType type,
struct EltwiseFunctorBase : OpKernel {
EltwiseFunctorBase(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input,
const int32_t scalar_input_index,
const DataFormat data_format)
: type_(type),
: OpKernel(context),
type_(type),
coeff_(coeff),
scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index),
......@@ -823,12 +826,14 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(type,
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
......@@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff,
const float scalar_input,
const int32_t scalar_input_index,
const DataFormat data_format)
: EltwiseFunctorBase(type,
: EltwiseFunctorBase(context,
type,
coeff,
scalar_input,
scalar_input_index,
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -30,8 +31,8 @@ template <DeviceType D, class T>
struct FillFunctor;
template <>
struct FillFunctor<DeviceType::CPU, float> {
FillFunctor() {}
struct FillFunctor<DeviceType::CPU, float> : OpKernel {
explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *shape,
const Tensor *value,
......
......@@ -27,10 +27,12 @@
namespace mace {
namespace kernels {
struct FullyConnectedBase {
FullyConnectedBase(const ActivationType activation,
struct FullyConnectedBase : OpKernel {
FullyConnectedBase(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit)
: activation_(activation),
: OpKernel(context),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
const ActivationType activation_;
......@@ -42,9 +44,10 @@ struct FullyConnectedFunctor;
template <>
struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
: FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *weight,
......@@ -86,9 +89,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
template <>
struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
: FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *weight,
......@@ -117,7 +121,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
const int32_t *bias_ptr = nullptr;
if (bias == nullptr) {
zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
new Tensor(GetCPUAllocator(), DT_INT32));
zero_bias->Resize(bias_shape);
zero_bias->Clear();
bias_ptr = zero_bias->data<int32_t>();
......@@ -148,9 +152,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
: FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *weight,
......
......@@ -21,13 +21,15 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
struct GatherBase {
explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {}
struct GatherBase : OpKernel {
GatherBase(OpKernelContext *context, int axis, float y)
: OpKernel(context), axis_(axis), y_(y) {}
int axis_;
float y_;
......@@ -38,7 +40,8 @@ struct GatherFunctor;
template <>
struct GatherFunctor<DeviceType::CPU, float> : GatherBase {
explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {}
GatherFunctor(OpKernelContext *context, int axis, float y)
: GatherBase(context, axis, y) {}
MaceStatus operator()(const Tensor *params,
const Tensor *indices,
......
......@@ -1341,8 +1341,8 @@ void Gemm(const float *A,
ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k);
const index_t ik_end = std::min(K, ik_begin + this_block_size_k);
Tensor trans_a;
Tensor trans_b;
Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
const float *real_a = nullptr;
const float *real_b = nullptr;
float *real_c = c_base + (ih_begin * width + iw_begin);
......@@ -1399,8 +1399,8 @@ void GemmRef(const float *A,
const bool transpose_b) {
memset(C, 0, sizeof(float) * batch * height * width);
Tensor trans_a;
Tensor trans_b;
Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
float *trans_a_data = nullptr;
float *trans_b_data = nullptr;
if (transpose_a) {
......
......@@ -20,21 +20,24 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h"
namespace mace {
namespace kernels {
struct ImageToBufferFunctorBase {
explicit ImageToBufferFunctorBase(const int wino_blk_size)
: wino_blk_size_(wino_blk_size) {}
struct ImageToBufferFunctorBase : OpKernel {
ImageToBufferFunctorBase(OpKernelContext *context,
const int wino_blk_size)
: OpKernel(context),
wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_;
};
template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
: ImageToBufferFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......@@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {}
ImageToBufferFunctor(OpKernelContext *context,
const int wino_blk_size)
: ImageToBufferFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input,
const BufferType type,
Tensor *output,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_KERNEL_H_
#define MACE_KERNELS_KERNEL_H_
#include "mace/core/op_kernel_context.h"
namespace mace {
namespace kernels {
struct OpKernel {
explicit OpKernel(OpKernelContext *context): context_(context) {}
OpKernelContext *context_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_KERNEL_H_
......@@ -21,7 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -34,7 +34,9 @@ template<DeviceType D, typename T>
struct LocalResponseNormFunctor;
template<>
struct LocalResponseNormFunctor<DeviceType::CPU, float> {
struct LocalResponseNormFunctor<DeviceType::CPU, float> : OpKernel {
explicit LocalResponseNormFunctor(OpKernelContext *context)
: OpKernel(context) {}
MaceStatus operator()(const Tensor *input,
int depth_radius,
float bias,
......
......@@ -23,6 +23,7 @@
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
......@@ -35,9 +36,10 @@ template <DeviceType D, typename T>
struct LSTMCellFunctor;
template <typename T>
struct LSTMCellFunctor<DeviceType::GPU, T> {
explicit LSTMCellFunctor(T forget_bias) :
forget_bias_(static_cast<T>(forget_bias)) {}
struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
LSTMCellFunctor(OpKernelContext *context, T forget_bias)
: OpKernel(context),
forget_bias_(static_cast<T>(forget_bias)) {}
MaceStatus operator()(const Tensor *input,
const Tensor *pre_output,
const Tensor *weight,
......
......@@ -29,6 +29,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/utils.h"
#include "mace/kernels/gemmlowp_util.h"
......@@ -40,7 +41,8 @@ namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct MatMulFunctor {
struct MatMulFunctor : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
......@@ -87,7 +89,7 @@ struct MatMulFunctor {
// A * B = (B^T * A^T)^T
if (!transpose_b) {
if (B_transpose_.get() == nullptr) {
B_transpose_.reset(new Tensor(GetDeviceAllocator(D),
B_transpose_.reset(new Tensor(context_->device()->allocator(),
DataTypeToEnum<T>::v()));
B_transpose_->Resize({batch, width, K});
Tensor::MappingGuard guardbt(B_transpose_.get());
......@@ -112,7 +114,8 @@ struct MatMulFunctor {
};
template <>
struct MatMulFunctor<CPU, uint8_t> {
struct MatMulFunctor<CPU, uint8_t> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
void MatMulImpl(const Tensor *A,
const Tensor *B,
......@@ -208,7 +211,8 @@ struct MatMulFunctor<CPU, uint8_t> {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> {
struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
......
......@@ -33,11 +33,11 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name);
......@@ -94,12 +94,12 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws,
lws, future));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
......
......@@ -34,7 +34,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3);
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
......@@ -49,7 +49,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
MACE_NOT_IMPLEMENTED;
}
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
......@@ -96,7 +96,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
......
......@@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
......@@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
......
......@@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name);
......@@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
cl_int error;
......
......@@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
}
}
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......
......@@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name);
......@@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
......
......@@ -22,13 +22,15 @@ namespace mace {
namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
......@@ -41,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace
static MaceStatus Concat2(cl::Kernel *kernel,
static MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
......@@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel,
static_cast<uint32_t>(batch * height),
};
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name);
......@@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel,
*prev_input_shape = input0->shape();
}
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
}
static MaceStatus ConcatN(cl::Kernel *kernel,
static MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
......@@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
const index_t height = output->dim(1);
const index_t width = output->dim(2);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
......@@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG_PTR;
......@@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
......@@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
}
}
if (future != nullptr) {
future->wait_fn = [runtime, call_stats](CallStats *stats) {
future->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
......@@ -234,12 +236,14 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
switch (inputs_count) {
case 2:
return Concat2(&kernel_, input_list[0], input_list[1],
return Concat2(context_,
&kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_);
default:
if (divisible_four) {
return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output,
return ConcatN(context_,
&kernel_, input_list, DataTypeToEnum<T>::value, output,
future, &kwg_size_, &kernel_error_);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -18,7 +18,8 @@
namespace mace {
namespace kernels {
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
typedef MaceStatus (*Conv2dOpenclFunction)(
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
const Tensor *filter, const Tensor *bias, const int stride,
const int *padding, const int *dilations,
const ActivationType activation,
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
......@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 3 &&
selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1];
return conv2d_func(
return conv2d_func(context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
} else {
return Conv2dOpencl(
return Conv2dOpencl(context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
......
......@@ -25,14 +25,16 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = runtime->device_compute_units();
const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
......@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
MACE_CHECK(input_batch == batch);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
......@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
......
......@@ -24,15 +24,17 @@ namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1);
runtime->device_compute_units() / 2, 1);
const uint32_t base =
std::max<uint32_t>(
std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
......@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
......@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS;
......
......@@ -26,7 +26,8 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20;
std::vector<uint32_t> LocalWS(const uint32_t *gws,
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kernel_size,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
......@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = runtime->device_compute_units();
const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
......@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
extern MaceStatus Conv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name);
......@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
......
......@@ -22,13 +22,15 @@ namespace mace {
namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
......@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(output->dim(0) * output->dim(1))
};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name);
......@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
......
......@@ -20,7 +20,8 @@ namespace kernels {
namespace {
MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
MaceStatus Deconv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name);
......@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
......@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(),
paddings.data(), activation_, relux_max_limit_,
DataTypeToEnum<T>::value, &input_shape_, output, future,
return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
strides_.data(), paddings.data(), activation_,
relux_max_limit_, DataTypeToEnum<T>::value,
&input_shape_, output, future,
&kwg_size_, &kernel_error_);
}
......
......@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss;
......@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -24,13 +24,15 @@ namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
......@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace
static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
static MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias,
......@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
......@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key =
Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
......@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
strides_, padding_type_, paddings_, dilations_, activation_,
relux_max_limit_)(input, filter, bias, output, future);
LOG(FATAL) << "GPU depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet.";
}
// Create a fake conv_2d filter to calculate the paddings and output size
......@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return DepthwiseConv2d(
context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_);
......
......@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
......@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS;
......
......@@ -22,7 +22,8 @@ namespace kernels {
namespace {
template <typename T>
MaceStatus FCWXKernel(cl::Kernel *kernel,
MaceStatus FCWXKernel(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
......@@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
const index_t batch = output->dim(0);
......@@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
......@@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
}
template <typename T>
MaceStatus FCWTXKernel(cl::Kernel *kernel,
MaceStatus FCWTXKernel(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
......@@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error);
OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
......@@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws->data(), *lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error);
......@@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
&output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
return FCWXKernel<T>(context_,
&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_);
}
......
......@@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
}
}
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
......@@ -245,13 +245,12 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
return lws;
}
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
......@@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
}
return error;
};
OpenCLProfilingTimer timer(&event);
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
OpenCLProfilingTimer timer(runtime, &event);
cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err);
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
OpenCLRuntime::Global()->GetCallStats(event, stats);
runtime->GetCallStats(event, stats);
}
};
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
......@@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
}
return error;
};
OpenCLProfilingTimer timer(&event);
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
OpenCLProfilingTimer timer(runtime, &event);
cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err);
......
......@@ -31,11 +31,11 @@
namespace mace {
namespace kernels {
#define OUT_OF_RANGE_CONFIG(kernel_error) \
#define OUT_OF_RANGE_CONFIG(kernel_error, context) \
if (runtime->IsOutOfRangeCheckEnabled()) { \
built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
(kernel_error) = std::move(std::unique_ptr<Buffer>( \
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \
new Buffer((context)->device()->allocator()))); \
MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
(kernel_error)->Map(nullptr); \
*((kernel_error)->mutable_data<char>()) = 0; \
......@@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpCompatibleCLDt(const DataType dt);
// Tuning or Run OpenCL kernel with 3D work group size
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
// Tuning or Run OpenCL kernel with 2D work group size
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
......@@ -162,7 +164,8 @@ std::string Concat(Args... args) {
return ss.str();
}
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size);
} // namespace kernels
} // namespace mace
......
......@@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break;
}
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......
......@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
const index_t width = input->dim(1);
const index_t width_blocks = width / 4;
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
......@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
static_cast<uint32_t>(height_blocks * batch),
};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
......@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -16,6 +16,8 @@
#include <vector>
#include "gtest/gtest.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
......@@ -25,14 +27,15 @@ namespace mace {
namespace kernels {
namespace {
bool BufferToImageOpImpl(Tensor *buffer,
bool BufferToImageOpImpl(OpKernelContext *context,
Tensor *buffer,
Tensor *image,
const std::vector<size_t> &image_shape) {
std::unique_ptr<BufferBase> kernel_error;
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
auto runtime = OpenCLRuntime::Global();
auto runtime = context->device()->opencl_runtime();
std::string kernel_name = "in_out_buffer_to_image";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
......@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str());
OUT_OF_RANGE_CONFIG(kernel_error);
OUT_OF_RANGE_CONFIG(kernel_error, context);
NON_UNIFORM_WG_CONFIG;
if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" +
......@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) {
index_t width = 7;
index_t channels = 11;
std::vector<index_t> buffer_shape = {batch, height, width, channels};
GPUContext gpu_context;
std::unique_ptr<Device> device(new GPUDevice(gpu_context.opencl_tuner()));
Workspace ws;
OpKernelContext context(&ws, device.get());
std::vector<index_t> buffer_shape = {batch, height, width, channels};
Tensor *buffer =
ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
ws.CreateTensor("Buffer", device->allocator(),
DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape);
std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
Tensor *image = ws.CreateTensor("Image", device->allocator(),
DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape);
ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape));
std::vector<size_t> overflow_image_shape = image_shape;
for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
overflow_image_shape[i] += 1;
}
ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
ASSERT_TRUE(BufferToImageOpImpl(&context,
buffer,
image,
overflow_image_shape));
}
} // namespace kernels
......
......@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name);
......@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -23,13 +23,15 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] =
......@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name);
......@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
};
}
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws.data(), lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels};
......@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name);
......
......@@ -23,9 +23,11 @@ namespace mace {
namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
......@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
auto dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
......@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -23,13 +23,15 @@ namespace mace {
namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) {
......@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
......@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -24,13 +24,15 @@ namespace kernels {
namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1;
} else {
uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) {
......@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name);
......@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_ = logits->shape();
}
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
output_list[i]->ResizeImage(output_shape, image_shape));
}
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name);
......@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) {
......
......@@ -24,12 +24,12 @@ namespace kernels {
template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
......@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_tensor->dim(0),
output_tensor->dim(1),
output_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor*> &inputs,
Tensor *output_tensor,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto runtime = context_->device()->opencl_runtime();
const Tensor *input_tensor = inputs[0];
const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
......@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name;
std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_);
OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) {
obfuscated_kernel_name =
......@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_);
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -29,10 +30,13 @@
namespace mace {
namespace kernels {
struct PadFunctorBase {
PadFunctorBase(const std::vector<int> &paddings,
struct PadFunctorBase : OpKernel {
PadFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value)
: paddings_(paddings), constant_value_(constant_value) {}
: OpKernel(context),
paddings_(paddings),
constant_value_(constant_value) {}
std::vector<int> paddings_;
float constant_value_;
......@@ -40,9 +44,10 @@ struct PadFunctorBase {
template<DeviceType D, typename T>
struct PadFunctor : public PadFunctorBase {
PadFunctor(const std::vector<int> &paddings,
PadFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value)
: PadFunctorBase(paddings, constant_value) {}
: PadFunctorBase(context, paddings, constant_value) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings,
PadFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value)
: PadFunctorBase(paddings, constant_value) {}
: PadFunctorBase(context, paddings, constant_value) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......
......@@ -23,6 +23,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/kernel.h"
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
......@@ -41,14 +42,16 @@ enum PoolingType {
namespace kernels {
struct PoolingFunctorBase {
PoolingFunctorBase(const PoolingType pooling_type,
struct PoolingFunctorBase : OpKernel {
PoolingFunctorBase(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: pooling_type_(pooling_type),
: OpKernel(context),
pooling_type_(pooling_type),
kernels_(kernels),
strides_(strides),
padding_type_(padding_type),
......@@ -68,14 +71,20 @@ struct PoolingFunctor;
template <>
struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
: PoolingFunctorBase(context,
pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {
}
void MaxPooling(const float *input,
......@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
template <>
struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
: PoolingFunctorBase(context,
pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {}
void MaxPooling(const uint8_t *input,
const index_t *in_shape,
......@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
: PoolingFunctorBase(context,
pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {
}
MaceStatus operator()(const Tensor *input_tensor,
Tensor *output_tensor,
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr,
template<DeviceType D, typename T>
struct ProposalFunctor {
ProposalFunctor(const int min_size,
struct ProposalFunctor : OpKernel {
ProposalFunctor(OpKernelContext *context,
const int min_size,
const float nms_thresh,
const int pre_nms_top_n,
const int post_nms_top_n,
......@@ -130,6 +132,7 @@ struct ProposalFunctor {
const int base_size,
const std::vector<int> &scales,
const std::vector<float> &ratios) :
OpKernel(context),
min_size_(min_size),
thresh_(nms_thresh),
pre_nms_top_n_(pre_nms_top_n),
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
namespace mace {
namespace kernels {
......@@ -173,8 +174,8 @@ template<DeviceType D, typename T>
struct QuantizeFunctor;
template<>
struct QuantizeFunctor<CPU, uint8_t> {
QuantizeFunctor() {}
struct QuantizeFunctor<CPU, uint8_t> : OpKernel {
explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input,
const bool non_zero,
......@@ -212,8 +213,8 @@ template<DeviceType D, typename T>
struct DequantizeFunctor;
template<>
struct DequantizeFunctor<CPU, uint8_t> {
DequantizeFunctor() {}
struct DequantizeFunctor<CPU, uint8_t> : OpKernel {
explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......
......@@ -24,6 +24,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif
......@@ -31,10 +32,12 @@
namespace mace {
namespace kernels {
struct ReduceFunctorBase {
ReduceFunctorBase(const std::vector<int> &axis,
struct ReduceFunctorBase : OpKernel {
ReduceFunctorBase(OpKernelContext *context,
const std::vector<int> &axis,
const bool keep_dims)
: keep_dims_(keep_dims),
: OpKernel(context),
keep_dims_(keep_dims),
axis_(axis) {}
bool keep_dims_;
bool reduce_first_axis_;
......@@ -44,10 +47,11 @@ struct ReduceFunctorBase {
};
template <DeviceType D, typename T>
struct ReduceMeanFunctor : ReduceFunctorBase{
ReduceMeanFunctor(const std::vector<int> &axis,
struct ReduceMeanFunctor : ReduceFunctorBase {
ReduceMeanFunctor(OpKernelContext *context,
const std::vector<int> &axis,
const bool keep_dims)
: ReduceFunctorBase(axis, keep_dims) {}
: ReduceFunctorBase(context, axis, keep_dims) {}
void Simplify(const Tensor *input) {
std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false);
......@@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{
template <typename T>
struct ReduceMeanFunctor<DeviceType::GPU, T>
: ReduceFunctorBase {
ReduceMeanFunctor(const std::vector<int> axis,
ReduceMeanFunctor(OpKernelContext *context,
const std::vector<int> axis,
const bool keep_dims)
: ReduceFunctorBase(axis, keep_dims) {}
: ReduceFunctorBase(context, axis, keep_dims) {}
MaceStatus operator()(const Tensor *input,
Tensor *output_tensor,
......
......@@ -19,17 +19,14 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
#include "mace/kernels/kernel.h"
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct ReshapeFunctor {
ReshapeFunctor() {}
struct ReshapeFunctor : OpKernel {
explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input,
const std::vector<index_t> &out_shape,
......
......@@ -21,6 +21,7 @@
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -136,10 +137,11 @@ inline void ResizeImage(const float *images,
}
}
struct ResizeBicubicFunctorBase {
ResizeBicubicFunctorBase(const std::vector<index_t> &size,
bool align_corners)
: align_corners_(align_corners) {
struct ResizeBicubicFunctorBase : OpKernel {
ResizeBicubicFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
: OpKernel(context), align_corners_(align_corners) {
MACE_CHECK(size.size() == 2);
out_height_ = size[0];
out_width_ = size[1];
......@@ -157,8 +159,10 @@ struct ResizeBicubicFunctor;
template<>
struct ResizeBicubicFunctor<DeviceType::CPU, float>
: ResizeBicubicFunctorBase {
ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBicubicFunctorBase(size, align_corners) {}
ResizeBicubicFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
: ResizeBicubicFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......@@ -203,8 +207,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>
template<typename T>
struct ResizeBicubicFunctor<DeviceType::GPU, T>
: ResizeBicubicFunctorBase {
ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBicubicFunctorBase(size, align_corners) {}
ResizeBicubicFunctor(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners)
: ResizeBicubicFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input,
Tensor *output,
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册