提交 a992621c 编写于 作者: L liuqi 提交者: 赵奇可

Refactor configuration APIs and Remove some global static variables.

上级 a7ff559c
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/benchmark/statistics.h" #include "mace/benchmark/statistics.h"
...@@ -257,36 +256,40 @@ int Main(int argc, char **argv) { ...@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {
mace::DeviceType device_type = ParseDeviceType(FLAGS_device); mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime // configuration
MaceStatus ret = mace::SetOpenMPThreadPolicy( MaceStatus mace_status;
MaceEngineConfig config(device_type);
mace_status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy), static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
true); true);
if (ret != MACE_SUCCESS) { if (mace_status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(INFO) << "Set openmp or cpu affinity failed.";
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( // DO NOT USE tmp directory.
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), // Please use APP's own directory and make sure the directory exists.
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
std::string(storage_path_ptr == nullptr ?
"/data/local/tmp/mace_run/interior" : storage_path_ptr);
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file}; std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/interior" : kernel_path);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
// Create Engine // Create Engine
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status; MaceStatus create_engine_status;
...@@ -306,7 +309,7 @@ int Main(int argc, char **argv) { ...@@ -306,7 +309,7 @@ int Main(int argc, char **argv) {
model_data_file_ptr, model_data_file_ptr,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status =
...@@ -314,7 +317,7 @@ int Main(int argc, char **argv) { ...@@ -314,7 +317,7 @@ int Main(int argc, char **argv) {
model_data_file_ptr, model_data_file_ptr,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
if (create_engine_status != MaceStatus::MACE_SUCCESS) { if (create_engine_status != MaceStatus::MACE_SUCCESS) {
......
...@@ -13,30 +13,12 @@ ...@@ -13,30 +13,12 @@
// limitations under the License. // limitations under the License.
#include "mace/core/allocator.h" #include "mace/core/allocator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_allocator.h"
#endif
namespace mace { namespace mace {
std::map<int32_t, Allocator *> *gAllocatorRegistry() { Allocator *GetCPUAllocator() {
static std::map<int32_t, Allocator *> g_allocator_registry; static CPUAllocator allocator;
return &g_allocator_registry; return &allocator;
} }
Allocator *GetDeviceAllocator(DeviceType type) {
auto iter = gAllocatorRegistry()->find(type);
if (iter == gAllocatorRegistry()->end()) {
LOG(ERROR) << "Allocator not found for device " << type;
return nullptr;
}
return iter->second;
}
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator());
#endif
MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator());
} // namespace mace } // namespace mace
...@@ -26,8 +26,6 @@ ...@@ -26,8 +26,6 @@
#include "mace/core/registry.h" #include "mace/core/registry.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/runtime_failure_mock.h" #include "mace/core/runtime_failure_mock.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
...@@ -138,26 +136,8 @@ class CPUAllocator : public Allocator { ...@@ -138,26 +136,8 @@ class CPUAllocator : public Allocator {
bool OnHost() const override { return true; } bool OnHost() const override { return true; }
}; };
std::map<int32_t, Allocator *> *gAllocatorRegistry(); // Global CPU allocator used for CPU/GPU/DSP
Allocator *GetCPUAllocator();
Allocator *GetDeviceAllocator(DeviceType type);
struct AllocatorRegisterer {
explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) {
if (gAllocatorRegistry()->count(type)) {
LOG(ERROR) << "Allocator for device type " << type
<< " registered twice. This should not happen."
<< gAllocatorRegistry()->count(type);
std::exit(1);
}
gAllocatorRegistry()->emplace(type, alloc);
}
};
#define MACE_REGISTER_ALLOCATOR(type, alloc) \
namespace { \
static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \
}
} // namespace mace } // namespace mace
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <vector> #include <vector>
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
......
...@@ -218,9 +218,9 @@ class Buffer : public BufferBase { ...@@ -218,9 +218,9 @@ class Buffer : public BufferBase {
class Image : public BufferBase { class Image : public BufferBase {
public: public:
Image() explicit Image(Allocator *allocator)
: BufferBase(0), : BufferBase(0),
allocator_(GetDeviceAllocator(GPU)), allocator_(allocator),
buf_(nullptr), buf_(nullptr),
mapped_buf_(nullptr) {} mapped_buf_(nullptr) {}
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/device.h"
namespace mace {
// Wires up the CPU runtime with the requested number of worker threads.
CPUDevice::CPUDevice(const int num_threads)
    : cpu_runtime_(new CPURuntime(num_threads)) {}

CPUDevice::~CPUDevice() = default;

CPURuntime *CPUDevice::cpu_runtime() {
  return cpu_runtime_.get();
}

#ifdef MACE_ENABLE_OPENCL
// The CPU device has no OpenCL runtime; callers must null-check.
OpenCLRuntime *CPUDevice::opencl_runtime() {
  return nullptr;
}
#endif

// CPU tensors are allocated from the process-wide CPU allocator.
Allocator *CPUDevice::allocator() {
  return GetCPUAllocator();
}

DeviceType CPUDevice::device_type() const {
  return DeviceType::CPU;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_DEVICE_H_
#define MACE_CORE_DEVICE_H_
#include <memory>
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/allocator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
namespace mace {
// Abstract compute device (CPU / GPU / ...). A Device bundles the runtime
// objects and the memory allocator needed to run a net on that device.
class Device {
 public:
  // Virtual destructor: concrete devices are deleted through Device*.
  virtual ~Device() {}

#ifdef MACE_ENABLE_OPENCL
  // OpenCL runtime of this device; may be nullptr for devices that do
  // not use OpenCL (see CPUDevice::opencl_runtime).
  virtual OpenCLRuntime *opencl_runtime() = 0;
#endif
  // CPU runtime (thread configuration) of this device.
  virtual CPURuntime *cpu_runtime() = 0;
  // Allocator used to create tensors for this device.
  virtual Allocator *allocator() = 0;
  // Enum tag identifying the concrete device type.
  virtual DeviceType device_type() const = 0;
};
// Device implementation backed by the host CPU.
class CPUDevice : public Device {
 public:
  // num_threads: number of threads handed to the owned CPURuntime.
  explicit CPUDevice(const int num_threads);
  virtual ~CPUDevice();

#ifdef MACE_ENABLE_OPENCL
  // Always returns nullptr: the CPU device carries no OpenCL runtime.
  OpenCLRuntime *opencl_runtime() override;
#endif
  CPURuntime *cpu_runtime() override;
  // Returns the process-wide CPU allocator.
  Allocator *allocator() override;
  // Returns DeviceType::CPU.
  DeviceType device_type() const override;

 private:
  std::unique_ptr<CPURuntime> cpu_runtime_;
};
} // namespace mace
#endif // MACE_CORE_DEVICE_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/device_context.h"
#include <sys/stat.h>
namespace mace {
namespace {
const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
// Returns the first entry in |paths| that exists and is a regular file,
// or an empty string when none qualifies.
//
// Fix: iterate by const reference — the previous `for (auto path : paths)`
// copied every std::string on each iteration.
std::string FindFirstExistPath(const std::vector<std::string> &paths) {
  std::string result;
  struct stat st;
  for (const auto &path : paths) {
    // stat() == 0 means the path exists; S_ISREG rejects directories,
    // sockets, etc.
    if (stat(path.c_str(), &st) == 0 && S_ISREG(st.st_mode)) {
      result = path;
      break;
    }
  }
  return result;
}
} // namespace
// Builds the GPU shared state:
//  - when |storage_path| is non-empty, opens a KV storage used to cache
//    OpenCL programs compiled at runtime;
//  - when one of |opencl_binary_paths| exists as a regular file, opens a
//    read-only storage over that precompiled binary;
//  - the tuner is always created from |opencl_parameter_path|.
GPUContext::GPUContext(const std::string &storage_path,
                       const std::vector<std::string> &opencl_binary_paths,
                       const std::string &opencl_parameter_path)
    : storage_factory_(new FileStorageFactory(storage_path)),
      opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
  if (!storage_path.empty()) {
    // Cache of programs compiled on this machine.
    opencl_cache_storage_ =
        storage_factory_->CreateStorage(kPrecompiledProgramFileName);
  }
  std::string precompiled_binary_path =
      FindFirstExistPath(opencl_binary_paths);
  if (!precompiled_binary_path.empty()) {
    // Precompiled binary shipped with the app, read directly from file.
    opencl_binary_storage_.reset(
        new FileStorage(precompiled_binary_path));
  }
}

GPUContext::~GPUContext() = default;

// May return nullptr when no precompiled binary path existed.
KVStorage *GPUContext::opencl_binary_storage() {
  return opencl_binary_storage_.get();
}

// May return nullptr when no storage path was configured.
KVStorage *GPUContext::opencl_cache_storage() {
  return opencl_cache_storage_.get();
}

Tuner<uint32_t> *GPUContext::opencl_tuner() {
  return opencl_tuner_.get();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_DEVICE_CONTEXT_H_
#define MACE_CORE_DEVICE_CONTEXT_H_
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "mace/core/file_storage.h"
#include "mace/utils/tuner.h"
namespace mace {
// Holds GPU-wide shared state: key-value storages for compiled and
// precompiled OpenCL programs plus the OpenCL tuner. Intended to be shared
// across engines (see MaceEngineConfig::SetGPUContext usage).
class GPUContext {
 public:
  // storage_path: directory for the runtime compile cache; "" disables it.
  // opencl_binary_path: candidate precompiled binary files; the first one
  //     that exists as a regular file is used.
  // opencl_parameter_path: tuned OpenCL parameter file.
  // NOTE(review): callable with a single argument yet not `explicit`;
  // confirm whether implicit conversion from std::string is intended.
  GPUContext(const std::string &storage_path = "",
             const std::vector<std::string> &opencl_binary_path = {},
             const std::string &opencl_parameter_path = "");
  ~GPUContext();

  // Storage over the precompiled binary; nullptr when none was found.
  KVStorage *opencl_binary_storage();
  // Runtime compile cache; nullptr when no storage path was configured.
  KVStorage *opencl_cache_storage();
  Tuner<uint32_t> *opencl_tuner();

 private:
  std::unique_ptr<KVStorageFactory> storage_factory_;
  std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
  std::unique_ptr<KVStorage> opencl_binary_storage_;
  std::unique_ptr<KVStorage> opencl_cache_storage_;
};
} // namespace mace
#endif // MACE_CORE_DEVICE_CONTEXT_H_
...@@ -28,10 +28,36 @@ ...@@ -28,10 +28,36 @@
namespace mace { namespace mace {
std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr; class FileStorageFactory::Impl {
public:
explicit Impl(const std::string &path);
std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
private:
std::string path_;
};
// Remembers the base directory all storages will be created under.
FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}

// Creates a file-backed KV storage at "<path_>/<name>".
//
// Fix: the previous implementation returned
// std::move(std::unique_ptr<KVStorage>(...)) — std::move on a prvalue is
// redundant and inhibits copy elision; return the temporary directly.
std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
    const std::string &name) {
  return std::unique_ptr<KVStorage>(
      new FileStorage(path_ + "/" + name));
}
// Forwards construction to the pimpl, which stores the base path.
FileStorageFactory::FileStorageFactory(const std::string &path):
    impl_(new FileStorageFactory::Impl(path)) {}

// Out-of-line so ~Impl is instantiated here, where Impl is complete.
FileStorageFactory::~FileStorageFactory() = default;

// Delegates to Impl: produces a FileStorage at "<path>/<name>".
std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
    const std::string &name) {
  return impl_->CreateStorage(name);
}
FileStorage::FileStorage(const std::string &file_path): FileStorage::FileStorage(const std::string &file_path):
data_changed_(false), file_path_(file_path) {} loaded_(false), data_changed_(false), file_path_(file_path) {}
int FileStorage::Load() { int FileStorage::Load() {
struct stat st; struct stat st;
...@@ -47,6 +73,9 @@ int FileStorage::Load() { ...@@ -47,6 +73,9 @@ int FileStorage::Load() {
} }
} }
utils::WriteLock lock(&data_mutex_); utils::WriteLock lock(&data_mutex_);
if (loaded_) {
return 0;
}
int fd = open(file_path_.c_str(), O_RDONLY); int fd = open(file_path_.c_str(), O_RDONLY);
if (fd < 0) { if (fd < 0) {
if (errno == ENOENT) { if (errno == ENOENT) {
...@@ -118,13 +147,17 @@ int FileStorage::Load() { ...@@ -118,13 +147,17 @@ int FileStorage::Load() {
<< " failed, error code: " << strerror(errno); << " failed, error code: " << strerror(errno);
return -1; return -1;
} }
loaded_ = true;
return 0; return 0;
} }
void FileStorage::Clear() { bool FileStorage::Clear() {
utils::WriteLock lock(&data_mutex_); utils::WriteLock lock(&data_mutex_);
data_.clear(); if (!data_.empty()) {
data_changed_ = true; data_.clear();
data_changed_ = true;
}
return true;
} }
bool FileStorage::Insert(const std::string &key, bool FileStorage::Insert(const std::string &key,
......
...@@ -16,27 +16,64 @@ ...@@ -16,27 +16,64 @@
#define MACE_CORE_FILE_STORAGE_H_ #define MACE_CORE_FILE_STORAGE_H_
#include <map> #include <map>
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/public/mace_runtime.h" #include "mace/public/mace.h"
#include "mace/utils/rwlock.h" #include "mace/utils/rwlock.h"
namespace mace { namespace mace {
// Abstract persistent key-value storage: string keys mapped to raw byte
// vectors.
class KVStorage {
 public:
  // Loads the backing data into memory.
  // return: 0 for success, -1 for error
  virtual int Load() = 0;
  // Removes all entries; returns true on success.
  virtual bool Clear() = 0;
  // insert or update the key-value.
  virtual bool Insert(const std::string &key,
                      const std::vector<unsigned char> &value) = 0;
  // Looks up |key|; presumably returns nullptr when the key is absent —
  // confirm against the concrete implementation.
  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
  // Persists in-memory data back to the backing store.
  // return: 0 for success, -1 for error
  virtual int Flush() = 0;
  virtual ~KVStorage() {}
};
// Factory interface producing KVStorage instances identified by name.
class KVStorageFactory {
 public:
  // Creates (or opens) the storage associated with |name|.
  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
  virtual ~KVStorageFactory() {}
};
// KVStorageFactory that materializes each named storage as a file inside a
// fixed directory.
class FileStorageFactory : public KVStorageFactory {
 public:
  // The application must have read and write permission for |path|.
  explicit FileStorageFactory(const std::string &path);
  ~FileStorageFactory();

  // Creates a file-backed storage located at "<path>/<name>".
  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;
};
class FileStorage : public KVStorage { class FileStorage : public KVStorage {
public: public:
explicit FileStorage(const std::string &file_path); explicit FileStorage(const std::string &file_path);
public: public:
int Load() override; int Load() override;
void Clear() override; bool Clear() override;
bool Insert(const std::string &key, bool Insert(const std::string &key,
const std::vector<unsigned char> &value) override; const std::vector<unsigned char> &value) override;
const std::vector<unsigned char> *Find(const std::string &key) override; const std::vector<unsigned char> *Find(const std::string &key) override;
int Flush() override; int Flush() override;
private: private:
bool loaded_;
bool data_changed_; bool data_changed_;
std::string file_path_; std::string file_path_;
std::map<std::string, std::vector<unsigned char>> data_; std::map<std::string, std::vector<unsigned char>> data_;
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/public/mace.h"
#include "mace/utils/memory_logging.h" #include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -27,30 +28,35 @@ namespace mace { ...@@ -27,30 +28,35 @@ namespace mace {
NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry, NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type) Device *device)
: name_(net_def->name()), op_registry_(op_registry) { : name_(net_def->name()), op_registry_(op_registry) {
MACE_UNUSED(ws); MACE_UNUSED(ws);
MACE_UNUSED(type); MACE_UNUSED(device);
} }
SerialNet::SerialNet( SerialNet::SerialNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode) const NetMode mode)
: NetBase(op_registry, net_def, ws, type), device_type_(type) { : NetBase(op_registry, net_def, ws, device), device_(device),
op_kernel_context_(new OpKernelContext(ws, device)) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name()); MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
DeviceType device_type = device->device_type();
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); const auto &operator_def = net_def->op(idx);
// TODO(liuqi): refactor to add device_type to OperatorDef // TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device = const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(device_type_)); operator_def, "device", static_cast<int>(device_type));
if (op_device == type) { if (op_device == device_type) {
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << ")";
OperatorDef temp_def(operator_def); OperatorDef temp_def(operator_def);
std::unique_ptr<OperatorBase> op( std::unique_ptr<OperatorBase> op(
op_registry->CreateOperator(temp_def, ws, type, mode)); op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
device_type, mode));
if (op) { if (op) {
operators_.emplace_back(std::move(op)); operators_.emplace_back(std::move(op));
} }
...@@ -61,13 +67,14 @@ SerialNet::SerialNet( ...@@ -61,13 +67,14 @@ SerialNet::SerialNet(
MaceStatus SerialNet::Run(RunMetadata *run_metadata) { MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
MACE_MEMORY_LOGGING_GUARD(); MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
const DeviceType device_type = device_->device_type();
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter; auto &op = *iter;
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
op->debug_def().type(), "), mem_id: ", op->debug_def().type(), "), mem_id: ",
MakeListString(op->debug_def().mem_id().data(), MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size())); op->debug_def().mem_id().size()));
bool future_wait = (device_type_ == DeviceType::GPU && bool future_wait = (device_type == DeviceType::GPU &&
(run_metadata != nullptr || (run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1)); std::distance(iter, operators_.end()) == 1));
...@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
} else { } else {
future.wait_fn(nullptr); future.wait_fn(nullptr);
} }
#ifdef MACE_ENABLE_OPENCL
device_->opencl_runtime()->command_queue().finish();
#endif
} else if (run_metadata != nullptr) { } else if (run_metadata != nullptr) {
call_stats.start_micros = NowMicros(); call_stats.start_micros = NowMicros();
MACE_RETURN_IF_ERROR(op->Run(nullptr)); MACE_RETURN_IF_ERROR(op->Run(nullptr));
...@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
VLOG(3) << "Operator " << op->debug_def().name() VLOG(3) << "Operator " << op->debug_def().name()
<< " has shape: " << MakeString(op->Output(0)->shape()); << " has shape: " << MakeString(op->Output(0)->shape());
if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) { if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
for (int i = 0; i < op->OutputSize(); ++i) { for (int i = 0; i < op->OutputSize(); ++i) {
int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT)); int data_type = op->GetOptionalArg("T", static_cast<int>(DT_FLOAT));
if (data_type == static_cast<int>(DT_FLOAT)) { if (data_type == static_cast<int>(DT_FLOAT)) {
...@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const NetDef &net_def, const NetDef &net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode) { const NetMode mode) {
std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def)); std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
return CreateNet(op_registry, tmp_net_def, ws, type, mode); return CreateNet(op_registry, tmp_net_def, ws, device, mode);
} }
std::unique_ptr<NetBase> CreateNet( std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode) { const NetMode mode) {
std::unique_ptr<NetBase> net( std::unique_ptr<NetBase> net(
new SerialNet(op_registry, net_def, ws, type, mode)); new SerialNet(op_registry, net_def, ws, device, mode));
return net; return net;
} }
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -33,7 +32,7 @@ class NetBase { ...@@ -33,7 +32,7 @@ class NetBase {
NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry, NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type); Device *device);
virtual ~NetBase() noexcept {} virtual ~NetBase() noexcept {}
virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0; virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
...@@ -52,14 +51,15 @@ class SerialNet : public NetBase { ...@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry, SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode = NetMode::NORMAL); const NetMode mode = NetMode::NORMAL);
MaceStatus Run(RunMetadata *run_metadata = nullptr) override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
protected: protected:
std::vector<std::unique_ptr<OperatorBase> > operators_; std::vector<std::unique_ptr<OperatorBase> > operators_;
DeviceType device_type_; Device *device_;
std::unique_ptr<OpKernelContext> op_kernel_context_;
MACE_DISABLE_COPY_AND_ASSIGN(SerialNet); MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
}; };
...@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const NetDef &net_def, const NetDef &net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode = NetMode::NORMAL); const NetMode mode = NetMode::NORMAL);
std::unique_ptr<NetBase> CreateNet( std::unique_ptr<NetBase> CreateNet(
const std::shared_ptr<const OperatorRegistryBase> op_registry, const std::shared_ptr<const OperatorRegistryBase> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, Device *device,
const NetMode mode = NetMode::NORMAL); const NetMode mode = NetMode::NORMAL);
} // namespace mace } // namespace mace
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/op_kernel_context.h"
namespace mace {
// Captures the workspace and device an operator kernel runs against.
// Both pointers are borrowed, not owned.
OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
    : device_(device), ws_(ws) {}

OpKernelContext::~OpKernelContext() = default;

// Device the kernel executes on (not owned).
Device* OpKernelContext::device() {
  return device_;
}

// Workspace holding the net's tensors (not owned).
Workspace* OpKernelContext::workspace() {
  return ws_;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
#define MACE_CORE_OP_KERNEL_CONTEXT_H_
#include "mace/core/device.h"
#include "mace/core/workspace.h"
namespace mace {
// Execution context handed to each operator: gives kernels access to the
// target Device and the tensor Workspace. Holds raw non-owning pointers;
// both objects must outlive the context.
class OpKernelContext {
 public:
  OpKernelContext(Workspace *ws, Device *device);
  ~OpKernelContext();
  Device *device();
  Workspace *workspace();

 private:
  Device *device_;
  Workspace *ws_;
};
} // namespace mace
#endif // MACE_CORE_OP_KERNEL_CONTEXT_H_
...@@ -18,12 +18,15 @@ ...@@ -18,12 +18,15 @@
#include <vector> #include <vector>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/core/op_kernel_context.h"
namespace mace { namespace mace {
OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws) OperatorBase::OperatorBase(const OperatorDef &operator_def,
: operator_ws_(ws), OpKernelContext *context)
operator_def_(std::make_shared<OperatorDef>(operator_def)) {} : operator_def_(std::make_shared<OperatorDef>(operator_def)) {
MACE_UNUSED(context);
}
OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {} OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
...@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {} ...@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}
std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator( std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
const OperatorDef &operator_def, const OperatorDef &operator_def,
Workspace *ws, OpKernelContext *context,
DeviceType type, DeviceType type,
const NetMode mode) const { const NetMode mode) const {
const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
...@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator( ...@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
.Device(type) .Device(type)
.TypeConstraint("T", static_cast<DataType>(dtype)) .TypeConstraint("T", static_cast<DataType>(dtype))
.Build(), .Build(),
operator_def, ws); operator_def, context);
} else { } else {
return nullptr; return nullptr;
} }
......
...@@ -22,17 +22,17 @@ ...@@ -22,17 +22,17 @@
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/registry.h" #include "mace/core/registry.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/public/mace.h"
namespace mace { namespace mace {
class OperatorBase { class OperatorBase {
public: public:
explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws); explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *);
virtual ~OperatorBase() noexcept {} virtual ~OperatorBase() noexcept {}
template <typename T> template <typename T>
...@@ -78,7 +78,6 @@ class OperatorBase { ...@@ -78,7 +78,6 @@ class OperatorBase {
inline bool has_debug_def() const { return operator_def_ != nullptr; } inline bool has_debug_def() const { return operator_def_ != nullptr; }
protected: protected:
Workspace *operator_ws_;
std::shared_ptr<const OperatorDef> operator_def_; std::shared_ptr<const OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_; std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_; std::vector<Tensor *> outputs_;
...@@ -89,8 +88,9 @@ class OperatorBase { ...@@ -89,8 +88,9 @@ class OperatorBase {
template <DeviceType D, class T> template <DeviceType D, class T>
class Operator : public OperatorBase { class Operator : public OperatorBase {
public: public:
explicit Operator(const OperatorDef &operator_def, Workspace *ws) explicit Operator(const OperatorDef &operator_def, OpKernelContext *context)
: OperatorBase(operator_def, ws) { : OperatorBase(operator_def, context) {
Workspace *ws = context->workspace();
for (const std::string &input_str : operator_def.input()) { for (const std::string &input_str : operator_def.input()) {
const Tensor *tensor = ws->GetTensor(input_str); const Tensor *tensor = ws->GetTensor(input_str);
MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), MACE_CHECK(tensor != nullptr, "op ", operator_def.type(),
...@@ -116,7 +116,7 @@ class Operator : public OperatorBase { ...@@ -116,7 +116,7 @@ class Operator : public OperatorBase {
output_type = DataTypeToEnum<T>::v(); output_type = DataTypeToEnum<T>::v();
} }
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, GetDeviceAllocator(D), output_type))); output_str, context->device()->allocator(), output_type)));
} }
} }
} }
...@@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { ...@@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
class OperatorRegistryBase { class OperatorRegistryBase {
public: public:
typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *> typedef Registry<std::string,
OperatorBase,
const OperatorDef &,
OpKernelContext *>
RegistryType; RegistryType;
OperatorRegistryBase() = default; OperatorRegistryBase() = default;
virtual ~OperatorRegistryBase(); virtual ~OperatorRegistryBase();
RegistryType *registry() { return &registry_; } RegistryType *registry() { return &registry_; }
std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def, std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
Workspace *ws, OpKernelContext *context,
DeviceType type, DeviceType type,
const NetMode mode) const; const NetMode mode) const;
...@@ -183,7 +186,7 @@ class OperatorRegistryBase { ...@@ -183,7 +186,7 @@ class OperatorRegistryBase {
MACE_DECLARE_REGISTRY(OpRegistry, MACE_DECLARE_REGISTRY(OpRegistry,
OperatorBase, OperatorBase,
const OperatorDef &, const OperatorDef &,
Workspace *); OpKernelContext *);
#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \ #define MACE_REGISTER_OPERATOR(op_registry, name, ...) \
MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__) MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
......
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/public/mace.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
......
...@@ -30,7 +30,6 @@ ...@@ -30,7 +30,6 @@
#include "public/gemmlowp.h" #include "public/gemmlowp.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
namespace mace { namespace mace {
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include <vector> #include <vector>
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
...@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, ...@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy, CPUAffinityPolicy policy,
bool use_gemmlowp = false); bool use_gemmlowp = false);
// Per-engine CPU runtime state.  Currently it only records the number of
// OpenMP worker threads the engine was configured with (replacing the old
// global thread-count setting).
class CPURuntime {
 public:
  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
  ~CPURuntime() = default;

  // Thread count supplied at construction; -1 appears to be used elsewhere
  // in this change as "library default" — confirm against CPUDevice usage.
  inline int num_threads() const { return num_threads_; }

 private:
  int num_threads_;
};
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_ #endif // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/gpu_device.h"
namespace mace {
// Constructs a GPU device on top of the CPUDevice base (which receives the
// configured thread count).  All pointer arguments are borrowed, not owned:
// they are forwarded into the OpenCLRuntime, which stores them as raw
// pointers, so the caller must keep them alive for this device's lifetime.
// The allocator is built against the freshly created runtime so all of its
// buffers/images live in that runtime's OpenCL context.
GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
                     KVStorage *opencl_cache_storage,
                     const GPUPriorityHint priority,
                     const GPUPerfHint perf,
                     KVStorage *opencl_binary_storage,
                     const int num_threads) :
    CPUDevice(num_threads),
    runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                               opencl_binary_storage, tuner)),
    allocator_(new OpenCLAllocator(runtime_.get())) {}
// Members are destroyed in reverse declaration order, so allocator_ (which
// holds a raw pointer into runtime_) is released before runtime_ itself.
GPUDevice::~GPUDevice() = default;

// Returns the device's OpenCL runtime.  Non-owning pointer; valid for the
// lifetime of this GPUDevice.
OpenCLRuntime* GPUDevice::opencl_runtime() {
  return runtime_.get();
}

// Returns the OpenCL allocator used for this device's tensors.  Non-owning
// pointer; valid for the lifetime of this GPUDevice.
Allocator* GPUDevice::allocator() {
  return allocator_.get();
}

// This object always represents a GPU, even though it inherits CPUDevice.
DeviceType GPUDevice::device_type() const {
  return DeviceType::GPU;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
#include <memory>
#include "mace/core/device_context.h"
#include "mace/core/device.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
namespace mace {
// Device backed by an OpenCL-capable GPU.  It derives from CPUDevice —
// presumably so CPU-side state (thread count) is still available through
// the same object; confirm against mace/core/device.h.
class GPUDevice : public CPUDevice {
 public:
  // All pointer parameters are borrowed and forwarded to the underlying
  // OpenCLRuntime; they must outlive this device.  num_threads = -1 keeps
  // the CPUDevice default behavior.
  GPUDevice(Tuner<uint32_t> *tuner,
            KVStorage *opencl_cache_storage = nullptr,
            const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
            const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
            KVStorage *opencl_binary_storage = nullptr,
            const int num_threads = -1);
  ~GPUDevice();
  // Overrides return non-owning pointers into the members below.
  OpenCLRuntime *opencl_runtime() override;
  Allocator *allocator() override;
  DeviceType device_type() const override;
 private:
  // Declaration order matters: allocator_ references runtime_ and is
  // therefore declared (and destroyed) after it.
  std::unique_ptr<OpenCLRuntime> runtime_;
  std::unique_ptr<OpenCLAllocator> allocator_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <memory>
#include "mace/core/runtime/opencl/opencl_allocator.h" #include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
...@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { ...@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
} }
} // namespace } // namespace
OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::OpenCLAllocator(
OpenCLRuntime *opencl_runtime):
opencl_runtime_(opencl_runtime) {}
OpenCLAllocator::~OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {}
MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
...@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { ...@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
} }
cl_int error; cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error); nbytes, nullptr, &error);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
...@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error; cl_int error;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Global()->context(), new cl::Image2D(opencl_runtime_->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
image_shape[0], image_shape[1], 0, nullptr, &error); image_shape[0], image_shape[1], 0, nullptr, &error);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
...@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const { ...@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
} }
void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
VLOG(3) << "Map OpenCL buffer";
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = opencl_runtime_->command_queue();
// TODO(heliangliang) Non-blocking call // TODO(heliangliang) Non-blocking call
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
...@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { ...@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
void *OpenCLAllocator::MapImage(void *buffer, void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch) const { std::vector<size_t> *mapped_image_pitch) const {
MACE_CHECK(image_shape.size() == 2, "Just support map 2d image"); VLOG(3) << "Map OpenCL Image";
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer); auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0}; std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1}; std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch->resize(2); mapped_image_pitch->resize(2);
cl_int error; cl_int error;
void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage( void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region, *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr, mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
nullptr, &error); nullptr, &error);
...@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer, ...@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer,
} }
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
VLOG(3) << "Unmap OpenCL buffer/Image";
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = opencl_runtime_->command_queue();
cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
nullptr, nullptr); nullptr, nullptr);
if (error != CL_SUCCESS) { if (error != CL_SUCCESS) {
......
...@@ -15,15 +15,17 @@ ...@@ -15,15 +15,17 @@
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_ #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_ #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
#include <memory>
#include <vector> #include <vector>
#include "mace/core/allocator.h" #include "mace/core/allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace { namespace mace {
class OpenCLAllocator : public Allocator { class OpenCLAllocator : public Allocator {
public: public:
OpenCLAllocator(); explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime);
~OpenCLAllocator() override; ~OpenCLAllocator() override;
...@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator { ...@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator {
void Unmap(void *buffer, void *mapped_ptr) const override; void Unmap(void *buffer, void *mapped_ptr) const override;
bool OnHost() const override; bool OnHost() const override;
private:
OpenCLRuntime *opencl_runtime_;
}; };
} // namespace mace } // namespace mace
......
...@@ -24,11 +24,9 @@ ...@@ -24,11 +24,9 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include "mace/public/mace_runtime.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/core/file_storage.h" #include "mace/core/file_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h" #include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/public/mace.h"
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
namespace mace { namespace mace {
...@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) { ...@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) {
const char *kOpenCLPlatformInfoKey = const char *kOpenCLPlatformInfoKey =
"mace_opencl_precompiled_platform_info_key"; "mace_opencl_precompiled_platform_info_key";
const char *kPrecompiledProgramFileName =
"mace_cl_compiled_program.bin";
} // namespace } // namespace
void OpenCLProfilingTimer::StartTiming() {} void OpenCLProfilingTimer::StartTiming() {}
void OpenCLProfilingTimer::StopTiming() { void OpenCLProfilingTimer::StopTiming() {
OpenCLRuntime::Global()->command_queue().finish(); runtime_->command_queue().finish();
start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>(); start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>(); stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
} }
...@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() { ...@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() {
accumulated_micros_ = 0; accumulated_micros_ = 0;
} }
GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL; OpenCLRuntime::OpenCLRuntime(
GPUPriorityHint OpenCLRuntime::kGPUPriorityHint = KVStorage *cache_storage,
GPUPriorityHint::PRIORITY_DEFAULT; const GPUPriorityHint priority_hint,
std::string const GPUPerfHint perf_hint,
OpenCLRuntime::kPrecompiledBinaryPath = ""; // NOLINT(runtime/string) KVStorage *precompiled_binary_storage,
Tuner<uint32_t> *tuner):
OpenCLRuntime *OpenCLRuntime::Global() { cache_storage_(cache_storage),
static OpenCLRuntime runtime; precompiled_binary_storage_(precompiled_binary_storage),
return &runtime; tuner_(tuner),
}
void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
GPUPriorityHint gpu_priority_hint) {
OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
}
void OpenCLRuntime::ConfigureOpenCLBinaryPath(
const std::vector<std::string> &paths) {
OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
LOG(WARNING) << "There is no precompiled OpenCL binary file in "
<< MakeString(paths);
}
}
OpenCLRuntime::OpenCLRuntime():
precompiled_binary_storage_(nullptr),
cache_storage_(nullptr),
is_opencl_avaliable_(false), is_opencl_avaliable_(false),
is_profiling_enabled_(false), is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN), opencl_version_(CL_VER_UNKNOWN),
...@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
cl_command_queue_properties properties = 0; cl_command_queue_properties properties = 0;
const char *profiling = getenv("MACE_OPENCL_PROFILING"); const char *profiling = getenv("MACE_OPENCL_PROFILING");
if (Tuner<uint32_t>::Get()->IsTuning() || if (IsTuning() ||
(profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) { (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
properties |= CL_QUEUE_PROFILING_ENABLE; properties |= CL_QUEUE_PROFILING_ENABLE;
is_profiling_enabled_ = true; is_profiling_enabled_ = true;
...@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
std::vector<cl_context_properties> context_properties; std::vector<cl_context_properties> context_properties;
context_properties.reserve(5); context_properties.reserve(5);
GetAdrenoContextProperties(&context_properties, GetAdrenoContextProperties(&context_properties,
OpenCLRuntime::kGPUPerfHint, perf_hint,
OpenCLRuntime::kGPUPriorityHint); priority_hint);
context_ = std::shared_ptr<cl::Context>( context_ = std::shared_ptr<cl::Context>(
new cl::Context({*device_}, context_properties.data(), new cl::Context({*device_}, context_properties.data(),
nullptr, nullptr, &err)); nullptr, nullptr, &err));
...@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
return; return;
} }
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
std::string cached_binary_platform_info; std::string cached_binary_platform_info;
if (kStorageFactory != nullptr) { if (cache_storage_ != nullptr) {
cache_storage_ =
kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
if (cache_storage_->Load() != 0) { if (cache_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. " LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
<< "Please make sure the storage directory exist " << "Please make sure the storage directory exist "
...@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime(): ...@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
} }
if (cached_binary_platform_info != platform_info_) { if (cached_binary_platform_info != platform_info_) {
if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) { if (precompiled_binary_storage_ == nullptr) {
precompiled_binary_storage_.reset( VLOG(1) << "There is no precompiled OpenCL binary in"
new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath)); " all OpenCL binary paths.";
} else {
if (precompiled_binary_storage_->Load() != 0) { if (precompiled_binary_storage_->Load() != 0) {
LOG(WARNING) << "Load OpenCL precompiled kernel file failed. " LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
<< "Please make sure the storage directory exist " << "Please make sure the storage directory exist "
...@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; } ...@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
Tuner<uint32_t> *OpenCLRuntime::tuner() { return tuner_; }
uint64_t OpenCLRuntime::device_global_mem_cache_size() const { uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
return device_gloabl_mem_cache_size_; return device_gloabl_mem_cache_size_;
} }
......
...@@ -22,11 +22,12 @@ ...@@ -22,11 +22,12 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include "mace/core/file_storage.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/string_util.h" #include "mace/utils/string_util.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#include "mace/utils/tuner.h"
namespace mace { namespace mace {
...@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error); ...@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error);
return MaceStatus::MACE_OUT_OF_RESOURCES; \ return MaceStatus::MACE_OUT_OF_RESOURCES; \
} }
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event)
: event_(event), accumulated_micros_(0) {}
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
class OpenCLRuntime { class OpenCLRuntime {
public: public:
static OpenCLRuntime *Global(); OpenCLRuntime(
static void Configure(GPUPerfHint, GPUPriorityHint); KVStorage *cache_storage = nullptr,
static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths); const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL,
const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL,
KVStorage *precompiled_binary_storage = nullptr,
Tuner<uint32_t> *tuner = nullptr);
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
cl::Context &context(); cl::Context &context();
cl::Device &device(); cl::Device &device();
...@@ -91,6 +80,7 @@ class OpenCLRuntime { ...@@ -91,6 +80,7 @@ class OpenCLRuntime {
const std::string platform_info() const; const std::string platform_info() const;
uint64_t device_global_mem_cache_size() const; uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const; uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
bool is_opencl_avaliable(); bool is_opencl_avaliable();
void GetCallStats(const cl::Event &event, CallStats *stats); void GetCallStats(const cl::Event &event, CallStats *stats);
...@@ -112,11 +102,6 @@ class OpenCLRuntime { ...@@ -112,11 +102,6 @@ class OpenCLRuntime {
void SaveBuiltCLProgram(); void SaveBuiltCLProgram();
private: private:
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
bool BuildProgram(const std::string &program_file_name, bool BuildProgram(const std::string &program_file_name,
const std::string &binary_file_name, const std::string &binary_file_name,
const std::string &build_options, const std::string &build_options,
...@@ -137,10 +122,13 @@ class OpenCLRuntime { ...@@ -137,10 +122,13 @@ class OpenCLRuntime {
OpenCLVersion ParseDeviceVersion(const std::string &device_version); OpenCLVersion ParseDeviceVersion(const std::string &device_version);
private: private:
std::unique_ptr<KVStorage> precompiled_binary_storage_; KVStorage *cache_storage_;
std::unique_ptr<KVStorage> cache_storage_; KVStorage *precompiled_binary_storage_;
Tuner<uint32_t> *tuner_;
bool is_opencl_avaliable_; bool is_opencl_avaliable_;
bool is_profiling_enabled_; bool is_profiling_enabled_;
OpenCLVersion opencl_version_;
GPUType gpu_type_;
// All OpenCL object must be a pointer and manually deleted before unloading // All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library. // OpenCL library.
std::shared_ptr<cl::Context> context_; std::shared_ptr<cl::Context> context_;
...@@ -149,18 +137,30 @@ class OpenCLRuntime { ...@@ -149,18 +137,30 @@ class OpenCLRuntime {
std::map<std::string, cl::Program> built_program_map_; std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_; std::mutex program_build_mutex_;
std::string platform_info_; std::string platform_info_;
OpenCLVersion opencl_version_;
std::string precompiled_binary_platform_info_; std::string precompiled_binary_platform_info_;
bool out_of_range_check_; bool out_of_range_check_;
uint64_t device_gloabl_mem_cache_size_; uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_; uint32_t device_compute_units_;
GPUType gpu_type_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
static std::string kPrecompiledBinaryPath;
}; };
class OpenCLProfilingTimer : public Timer {
public:
OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
: runtime_(runtime), event_(event), accumulated_micros_(0) {}
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
OpenCLRuntime *runtime_;
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_ #endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#endif #endif
#include "mace/public/mace.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#ifdef MACE_ENABLE_NEON #ifdef MACE_ENABLE_NEON
...@@ -38,10 +37,10 @@ ...@@ -38,10 +37,10 @@
namespace mace { namespace mace {
#define MACE_SINGLE_ARG(...) __VA_ARGS__ #define MACE_SINGLE_ARG(...) __VA_ARGS__
#define MACE_CASE(TYPE, STATEMENTS) \ #define MACE_CASE(TYPE, STATEMENTS) \
case DataTypeToEnum<TYPE>::value: { \ case DataTypeToEnum<TYPE>::value: { \
typedef TYPE T; \ typedef TYPE T; \
STATEMENTS; \ STATEMENTS; \
break; \ break; \
} }
...@@ -137,7 +136,7 @@ class Tensor { ...@@ -137,7 +136,7 @@ class Tensor {
buffer_ = &buffer_slice_; buffer_ = &buffer_slice_;
} }
Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {} Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
~Tensor() { ~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) { if (is_buffer_owner_ && buffer_ != nullptr) {
...@@ -270,7 +269,7 @@ class Tensor { ...@@ -270,7 +269,7 @@ class Tensor {
image_shape_ = image_shape; image_shape_ = image_shape;
if (buffer_ == nullptr) { if (buffer_ == nullptr) {
MACE_CHECK(is_buffer_owner_); MACE_CHECK(is_buffer_owner_);
buffer_ = new Image(); buffer_ = new Image(allocator_);
return buffer_->Allocate(image_shape, dtype_); return buffer_->Allocate(image_shape, dtype_);
} else { } else {
MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize."); MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
......
...@@ -16,15 +16,10 @@ ...@@ -16,15 +16,10 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h" #include "mace/core/testing/test_benchmark.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*"); DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
DEFINE_int32(cpu_affinity_policy, 1, DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
...@@ -43,10 +38,6 @@ int main(int argc, char **argv) { ...@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
mace::OpenCLRuntime::Configure(
static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
mace::testing::Benchmark::Run(FLAGS_filter.c_str()); mace::testing::Benchmark::Run(FLAGS_filter.c_str());
return 0; return 0;
} }
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/core/workspace.h"
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include <unordered_set> #include <unordered_set>
...@@ -21,8 +24,6 @@ ...@@ -21,8 +24,6 @@
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#endif #endif
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
namespace mace { namespace mace {
...@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { ...@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
} }
} // namespace } // namespace
Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer( Workspace::Workspace() :
GetDeviceAllocator(DeviceType::CPU))) {} host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
Tensor *Workspace::CreateTensor(const std::string &name, Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
...@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const { ...@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const {
} }
MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
DeviceType type, Device *device,
const unsigned char *model_data) { const unsigned char *model_data) {
MACE_LATENCY_LOGGER(1, "Load model tensors"); MACE_LATENCY_LOGGER(1, "Load model tensors");
index_t model_data_size = 0; index_t model_data_size = 0;
...@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
VLOG(3) << "Model data size: " << model_data_size; VLOG(3) << "Model data size: " << model_data_size;
const DeviceType device_type = device->device_type();
if (model_data_size > 0) { if (model_data_size > 0) {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (type == DeviceType::GPU && if (device_type == DeviceType::GPU &&
OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <= device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
static_cast<uint64_t>(model_data_size)) { static_cast<uint64_t>(model_data_size)) {
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
...@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
std::unique_ptr<Tensor> tensor( std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(type), new Tensor(device->allocator(),
const_tensor.data_type(), true)); const_tensor.data_type(), true));
tensor->Resize(dims); tensor->Resize(dims);
...@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
#else #else
{ {
#endif #endif
if (type == DeviceType::CPU) { if (device_type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>( tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), new Buffer(device->allocator(),
const_cast<unsigned char*>(model_data), const_cast<unsigned char*>(model_data),
model_data_size)); model_data_size));
} else { } else {
tensor_buffer_ = std::unique_ptr<Buffer>( tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type))); new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
tensor_buffer_->Map(nullptr); tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data), tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
...@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
} }
if (type == DeviceType::CPU || type == DeviceType::GPU) { if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, type); MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status; if (status != MaceStatus::MACE_SUCCESS) return status;
} }
if (type == DeviceType::CPU && net_def.has_quantize_info()) { if (device_type == DeviceType::CPU && net_def.has_quantize_info()) {
for (const auto for (const auto
&activation_info: net_def.quantize_info().activation_info()) { &activation_info: net_def.quantize_info().activation_info()) {
if (HasTensor(activation_info.tensor_name())) { if (HasTensor(activation_info.tensor_name())) {
...@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
} }
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type) { Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID; DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) { if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id, // We use the data type of the first op with mem id,
...@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
<< ", memory type: " << mem_block.mem_type(); << ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf( std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::CPU))); new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype) mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE)); + MACE_EXTRA_BUFFER_PAD_SIZE));
...@@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::move(tensor_buf)); std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf( std::unique_ptr<BufferBase> image_buf(
new Image()); new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate( MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype)); {mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf)); std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf( std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(DeviceType::GPU))); new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype))); mem_block.x() * GetEnumTypeSize(dtype)));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
...@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
op, "T", static_cast<int>(DT_FLOAT))); op, "T", static_cast<int>(DT_FLOAT)));
} }
CreateTensor(op.output(i), CreateTensor(op.output(i),
GetDeviceAllocator(device_type), device->allocator(),
output_type); output_type);
} }
} }
...@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() { ...@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() {
} }
void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data) { const unsigned char *model_data,
Allocator *alloc) {
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
auto iter = tensor_map_.find(const_tensor.name()); auto iter = tensor_map_.find(const_tensor.name());
if (iter->second->unused()) { if (iter->second->unused()) {
...@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, ...@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
dims.push_back(d); dims.push_back(d);
} }
std::unique_ptr<Tensor> tensor( std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(DeviceType::GPU), new Tensor(alloc, const_tensor.data_type()));
const_tensor.data_type()));
tensor->Resize(dims); tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(), MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape"); "Tensor's data_size not equal with the shape");
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include <memory> #include <memory>
#include "mace/core/device.h"
#include "mace/core/preallocated_pooled_allocator.h" #include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -48,7 +49,7 @@ class Workspace { ...@@ -48,7 +49,7 @@ class Workspace {
std::vector<std::string> Tensors() const; std::vector<std::string> Tensors() const;
MaceStatus LoadModelTensor(const NetDef &net_def, MaceStatus LoadModelTensor(const NetDef &net_def,
DeviceType type, Device *device,
const unsigned char *model_data); const unsigned char *model_data);
ScratchBuffer *GetScratchBuffer(DeviceType device_type); ScratchBuffer *GetScratchBuffer(DeviceType device_type);
...@@ -56,11 +57,14 @@ class Workspace { ...@@ -56,11 +57,14 @@ class Workspace {
void RemoveUnusedBuffer(); void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def, void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data); const unsigned char *model_data,
Allocator *alloc);
private: private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type); Device *device);
Device *device_;
TensorMap tensor_map_; TensorMap tensor_map_;
......
...@@ -37,15 +37,13 @@ public class AppModel { ...@@ -37,15 +37,13 @@ public class AppModel {
mJniThread = new Handler(thread.getLooper()); mJniThread = new Handler(thread.getLooper());
} }
public void maceMobilenetSetAttrs(final InitData initData) { public void maceMobilenetCreateGPUContext(final InitData initData) {
mJniThread.post(new Runnable() { mJniThread.post(new Runnable() {
@Override @Override
public void run() { public void run() {
int result = JniMaceUtils.maceMobilenetSetAttrs( int result = JniMaceUtils.maceMobilenetCreateGPUContext(
initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(), initData.getStoragePath());
initData.getGpuPerfHint(), initData.getGpuPriorityHint(), Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result);
initData.getKernelPath());
Log.i("APPModel", "maceMobilenetSetAttrs result = " + result);
} }
}); });
} }
...@@ -54,7 +52,10 @@ public class AppModel { ...@@ -54,7 +52,10 @@ public class AppModel {
mJniThread.post(new Runnable() { mJniThread.post(new Runnable() {
@Override @Override
public void run() { public void run() {
int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice()); int result = JniMaceUtils.maceMobilenetCreateEngine(
initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
initData.getModel(), initData.getDevice());
Log.i("APPModel", "maceMobilenetCreateEngine result = " + result); Log.i("APPModel", "maceMobilenetCreateEngine result = " + result);
if (result == -1) { if (result == -1) {
......
...@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap ...@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap
} }
private void initJni() { private void initJni() {
AppModel.instance.maceMobilenetSetAttrs(initData); AppModel.instance.maceMobilenetCreateGPUContext(initData);
AppModel.instance.maceMobilenetCreateEngine(initData, this); AppModel.instance.maceMobilenetCreateEngine(initData, this);
} }
......
...@@ -29,7 +29,7 @@ public class InitData { ...@@ -29,7 +29,7 @@ public class InitData {
private int cpuAffinityPolicy; private int cpuAffinityPolicy;
private int gpuPerfHint; private int gpuPerfHint;
private int gpuPriorityHint; private int gpuPriorityHint;
private String kernelPath = ""; private String storagePath = "";
public InitData() { public InitData() {
model = MODELS[0]; model = MODELS[0];
...@@ -38,8 +38,8 @@ public class InitData { ...@@ -38,8 +38,8 @@ public class InitData {
gpuPerfHint = 3; gpuPerfHint = 3;
gpuPriorityHint = 3; gpuPriorityHint = 3;
device = DEVICES[0]; device = DEVICES[0];
kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace"; storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
File file = new File(kernelPath); File file = new File(storagePath);
if (!file.exists()) { if (!file.exists()) {
file.mkdir(); file.mkdir();
} }
...@@ -94,11 +94,11 @@ public class InitData { ...@@ -94,11 +94,11 @@ public class InitData {
this.gpuPriorityHint = gpuPriorityHint; this.gpuPriorityHint = gpuPriorityHint;
} }
public String getKernelPath() { public String getStoragePath() {
return kernelPath; return storagePath;
} }
public void setKernelPath(String kernelPath) { public void setStoragePath(String storagePath) {
this.kernelPath = kernelPath; this.storagePath = storagePath;
} }
} }
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#include <numeric> #include <numeric>
#include "src/main/cpp/include/mace/public/mace.h" #include "src/main/cpp/include/mace/public/mace.h"
#include "src/main/cpp/include/mace/public/mace_runtime.h"
#include "src/main/cpp/include/mace/public/mace_engine_factory.h" #include "src/main/cpp/include/mace/public/mace_engine_factory.h"
namespace { namespace {
...@@ -39,8 +38,8 @@ struct ModelInfo { ...@@ -39,8 +38,8 @@ struct ModelInfo {
}; };
struct MaceContext { struct MaceContext {
std::shared_ptr<mace::GPUContext> gpu_context;
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
std::shared_ptr<mace::KVStorageFactory> storage_factory;
std::string model_name; std::string model_name;
mace::DeviceType device_type = mace::DeviceType::CPU; mace::DeviceType device_type = mace::DeviceType::CPU;
std::map<std::string, ModelInfo> model_infos = { std::map<std::string, ModelInfo> model_infos = {
...@@ -72,48 +71,65 @@ MaceContext& GetMaceContext() { ...@@ -72,48 +71,65 @@ MaceContext& GetMaceContext() {
} // namespace } // namespace
JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs( JNIEXPORT jint JNICALL
JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(
jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) { JNIEnv *env, jclass thisObj, jstring storage_path) {
MaceContext &mace_context = GetMaceContext(); MaceContext &mace_context = GetMaceContext();
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr);
if (storage_path_ptr == nullptr) return JNI_ERR;
const std::string storage_file_path(storage_path_ptr);
env->ReleaseStringUTFChars(storage_path, storage_path_ptr);
mace::MaceStatus status; mace_context.gpu_context = mace::GPUContextBuilder()
// openmp .SetStoragePath(storage_file_path)
status = mace::SetOpenMPThreadPolicy( .Finalize();
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
// gpu
mace::SetGPUHints(
static_cast<mace::GPUPerfHint>(gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"gpu perf: %d, priority: %d",
gpu_perf_hint, gpu_priority_hint);
// opencl cache
const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr);
if (kernel_path_ptr == nullptr) return JNI_ERR;
const std::string kernel_file_path(kernel_path_ptr);
mace_context.storage_factory.reset(
new mace::FileStorageFactory(kernel_file_path));
mace::SetKVStorageFactory(mace_context.storage_factory);
env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr);
return JNI_OK; return JNI_OK;
} }
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) { JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
jint gpu_perf_hint, jint gpu_priority_hint,
jstring model_name_str, jstring device) {
MaceContext &mace_context = GetMaceContext(); MaceContext &mace_context = GetMaceContext();
// get device
const char *device_ptr = env->GetStringUTFChars(device, nullptr);
if (device_ptr == nullptr) return JNI_ERR;
mace_context.device_type = ParseDeviceType(device_ptr);
env->ReleaseStringUTFChars(device, device_ptr);
// create MaceEngineConfig
mace::MaceStatus status;
mace::MaceEngineConfig config(mace_context.device_type);
status = config.SetCPUThreadPolicy(
omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
if (status != mace::MACE_SUCCESS) {
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %d, threads: %d, cpu: %d",
status, omp_num_threads, cpu_affinity_policy);
}
if (mace_context.device_type == mace::DeviceType::GPU) {
config.SetGPUContext(mace_context.gpu_context);
config.SetGPUHints(
static_cast<mace::GPUPerfHint>(gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"gpu perf: %d, priority: %d",
gpu_perf_hint, gpu_priority_hint);
}
__android_log_print(ANDROID_LOG_INFO,
"image_classify attrs",
"device: %d",
mace_context.device_type);
// parse model name // parse model name
const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr); const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr);
if (model_name_ptr == nullptr) return JNI_ERR; if (model_name_ptr == nullptr) return JNI_ERR;
...@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( ...@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
std::vector<std::string> input_names = {model_info_iter->second.input_name}; std::vector<std::string> input_names = {model_info_iter->second.input_name};
std::vector<std::string> output_names = {model_info_iter->second.output_name}; std::vector<std::string> output_names = {model_info_iter->second.output_name};
// get device
const char *device_ptr = env->GetStringUTFChars(device, nullptr);
if (device_ptr == nullptr) return JNI_ERR;
mace_context.device_type = ParseDeviceType(device_ptr);
env->ReleaseStringUTFChars(device, device_ptr);
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"device: %d",
mace_context.device_type);
mace::MaceStatus create_engine_status = mace::MaceStatus create_engine_status =
CreateMaceEngineFromCode(mace_context.model_name, CreateMaceEngineFromCode(mace_context.model_name,
std::string(), std::string(),
input_names, input_names,
output_names, output_names,
mace_context.device_type, config,
&mace_context.engine); &mace_context.engine);
__android_log_print(ANDROID_LOG_ERROR, __android_log_print(ANDROID_LOG_INFO,
"image_classify attrs", "image_classify attrs",
"create result: %d", "create result: %d",
create_engine_status); create_engine_status);
......
...@@ -24,11 +24,13 @@ extern "C" { ...@@ -24,11 +24,13 @@ extern "C" {
#endif #endif
/* /*
* Class: com_xiaomi_mace_JniMaceUtils * Class: com_xiaomi_mace_JniMaceUtils
* Method: maceMobilenetSetAttrs * Method: maceMobilenetCreateGPUContext
* Signature: (Ljava/lang/String;IIIILjava/lang/String;)I * Signature: (Ljava/lang/String;IIIILjava/lang/String;)I
*/ */
JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs JNIEXPORT jint JNICALL
(JNIEnv *, jclass, jint, jint, jint, jint, jstring); Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *,
jclass,
jstring);
/* /*
* Class: com_xiaomi_mace_JniMaceUtils * Class: com_xiaomi_mace_JniMaceUtils
...@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs ...@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
*/ */
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine
(JNIEnv *, jclass, jstring, jstring); (JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring);
/* /*
* Class: com_xiaomi_mace_JniMaceUtils * Class: com_xiaomi_mace_JniMaceUtils
......
...@@ -20,9 +20,9 @@ public class JniMaceUtils { ...@@ -20,9 +20,9 @@ public class JniMaceUtils {
System.loadLibrary("mace_mobile_jni"); System.loadLibrary("mace_mobile_jni");
} }
public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath); public static native int maceMobilenetCreateGPUContext(String storagePath);
public static native int maceMobilenetCreateEngine(String model, String device); public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device);
public static native float[] maceMobilenetClassify(float[] input); public static native float[] maceMobilenetClassify(float[] input);
......
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
// if convert model to code. // if convert model to code.
#ifdef MODEL_GRAPH_FORMAT_CODE #ifdef MODEL_GRAPH_FORMAT_CODE
#include "mace/codegen/engine/mace_engine_factory.h" #include "mace/codegen/engine/mace_engine_factory.h"
...@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes) {
// load model // load model
DeviceType device_type = ParseDeviceType(FLAGS_device); DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime // configuration
mace::SetOpenMPThreadPolicy( // Detailed information please see mace.h
MaceStatus status;
MaceEngineConfig config(device_type);
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (status != MACE_SUCCESS) {
std::cerr << "Set openmp or cpu affinity failed." << std::endl;
}
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( // DO NOT USE tmp directory.
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), // Please use APP's own directory and make sure the directory exists.
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
// Just call once. (Not thread-safe) std::string(storage_path_ptr == nullptr ?
// Set paths of Generated OpenCL Compiled Kernel Binary file "/data/local/tmp/mace_run/interior" : storage_path_ptr);
// if you build gpu library of specific soc.
// Using OpenCL binary will speed up the initialization.
// OpenCL binary is corresponding to the OpenCL Driver version,
// you should update the binary when OpenCL Driver changed.
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file}; std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
// DO NOT USE tmp directory.
// Please use APP's own directory and make sure the directory exists.
// Just call once
const std::string internal_storage_path =
"/data/local/tmp/mace_run/interior";
// Config internal kv storage factory.
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(internal_storage_path));
SetKVStorageFactory(storage_factory);
// Create Engine // Create Engine
std::shared_ptr<mace::MaceEngine> engine; std::shared_ptr<mace::MaceEngine> engine;
MaceStatus create_engine_status; MaceStatus create_engine_status;
...@@ -204,7 +203,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -204,7 +203,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
std::vector<unsigned char> model_pb_data; std::vector<unsigned char> model_pb_data;
...@@ -216,7 +215,7 @@ bool RunModel(const std::vector<std::string> &input_names, ...@@ -216,7 +215,7 @@ bool RunModel(const std::vector<std::string> &input_names,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -126,10 +127,14 @@ template <DeviceType D, typename T> ...@@ -126,10 +127,14 @@ template <DeviceType D, typename T>
class ActivationFunctor; class ActivationFunctor;
template <> template <>
class ActivationFunctor<DeviceType::CPU, float> { class ActivationFunctor<DeviceType::CPU, float> : OpKernel {
public: public:
ActivationFunctor(ActivationType type, float relux_max_limit) ActivationFunctor(OpKernelContext *context,
: activation_(type), relux_max_limit_(relux_max_limit) {} ActivationType type,
float relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
...@@ -159,10 +164,14 @@ class ActivationFunctor<DeviceType::CPU, float> { ...@@ -159,10 +164,14 @@ class ActivationFunctor<DeviceType::CPU, float> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
class ActivationFunctor<DeviceType::GPU, T> { class ActivationFunctor<DeviceType::GPU, T> : OpKernel {
public: public:
ActivationFunctor(ActivationType type, T relux_max_limit) ActivationFunctor(OpKernelContext *context,
: activation_(type), relux_max_limit_(static_cast<T>(relux_max_limit)) {} ActivationType type,
T relux_max_limit)
: OpKernel(context),
activation_(type),
relux_max_limit_(static_cast<T>(relux_max_limit)) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *alpha, const Tensor *alpha,
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -35,10 +36,11 @@ namespace kernels { ...@@ -35,10 +36,11 @@ namespace kernels {
constexpr int kCostPerGroup = 1024; constexpr int kCostPerGroup = 1024;
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct AddNFunctor { struct AddNFunctor : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors, MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
MACE_UNUSED(future); MACE_UNUSED(future);
MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0]));
index_t size = output_tensor->size(); index_t size = output_tensor->size();
...@@ -95,7 +97,8 @@ struct AddNFunctor { ...@@ -95,7 +97,8 @@ struct AddNFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct AddNFunctor<DeviceType::GPU, T> { struct AddNFunctor<DeviceType::GPU, T> : OpKernel {
explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_tensors, MaceStatus operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future); StatsFuture *future);
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -30,7 +31,8 @@ namespace mace { ...@@ -30,7 +31,8 @@ namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ArgMaxFunctor { struct ArgMaxFunctor : OpKernel {
explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *axis, const Tensor *axis,
Tensor *output, Tensor *output,
......
...@@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) { ...@@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) {
index_t filter_size = 3 * 3 * in_channels * out_channels; index_t filter_size = 3 * 3 * in_channels * out_channels;
index_t output_size = batch * out_channels * out_height * out_width; index_t output_size = batch * out_channels * out_height * out_width;
Tensor input; Tensor input(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor filter; Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output; Tensor output(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor output_ref; Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT);
input.Resize({batch, in_channels, in_height, in_width}); input.Resize({batch, in_channels, in_height, in_width});
filter.Resize({out_channels, in_channels, 3, 3}); filter.Resize({out_channels, in_channels, 3, 3});
......
...@@ -33,11 +33,13 @@ ...@@ -33,11 +33,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BatchNormFunctorBase { struct BatchNormFunctorBase : OpKernel {
BatchNormFunctorBase(bool folded_constant, BatchNormFunctorBase(OpKernelContext *context,
bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: folded_constant_(folded_constant), : OpKernel(context),
folded_constant_(folded_constant),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit) {}
...@@ -51,10 +53,14 @@ struct BatchNormFunctor; ...@@ -51,10 +53,14 @@ struct BatchNormFunctor;
template<> template<>
struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase { struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} : BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
...@@ -132,10 +138,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase { ...@@ -132,10 +138,14 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase { struct BatchNormFunctor<DeviceType::GPU, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, BatchNormFunctor(OpKernelContext *context,
const bool folded_constant,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} : BatchNormFunctorBase(context,
folded_constant,
activation,
relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *scale, const Tensor *scale,
const Tensor *offset, const Tensor *offset,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -30,10 +31,10 @@ ...@@ -30,10 +31,10 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BiasAddFunctorBase { struct BiasAddFunctorBase : OpKernel {
explicit BiasAddFunctorBase(const DataFormat data_format) { BiasAddFunctorBase(OpKernelContext *context,
data_format_ = data_format; const DataFormat data_format)
} : OpKernel(context), data_format_(data_format) {}
DataFormat data_format_; DataFormat data_format_;
}; };
...@@ -43,8 +44,9 @@ struct BiasAddFunctor; ...@@ -43,8 +44,9 @@ struct BiasAddFunctor;
template <> template <>
struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase { struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
explicit BiasAddFunctor(const DataFormat data_format) BiasAddFunctor(OpKernelContext *context,
: BiasAddFunctorBase(data_format) {} const DataFormat data_format)
: BiasAddFunctorBase(context, data_format) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
...@@ -96,8 +98,8 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase { ...@@ -96,8 +98,8 @@ struct BiasAddFunctor<DeviceType::CPU, float> : BiasAddFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase { struct BiasAddFunctor<DeviceType::GPU, T> : BiasAddFunctorBase {
explicit BiasAddFunctor(const DataFormat data_format) BiasAddFunctor(OpKernelContext *context, const DataFormat data_format)
: BiasAddFunctorBase(data_format) {} : BiasAddFunctorBase(context, data_format) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
......
...@@ -20,21 +20,24 @@ ...@@ -20,21 +20,24 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h" #include "mace/kernels/opencl/common.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase : OpKernel {
explicit BufferToImageFunctorBase(const int wino_blk_size) explicit BufferToImageFunctorBase(OpKernelContext *context,
: wino_blk_size_(wino_blk_size) {} const int wino_blk_size)
: OpKernel(context), wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_; const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase { struct BufferToImageFunctor : BufferToImageFunctorBase {
explicit BufferToImageFunctor(const int wino_blk_size) explicit BufferToImageFunctor(OpKernelContext *context,
: BufferToImageFunctorBase(wino_blk_size) {} const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::GPU, T> : BufferToImageFunctorBase {
explicit BufferToImageFunctor(const int wino_blk_size) explicit BufferToImageFunctor(OpKernelContext *context,
: BufferToImageFunctorBase(wino_blk_size) {} const int wino_blk_size)
: BufferToImageFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
...@@ -20,13 +20,15 @@ ...@@ -20,13 +20,15 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct ChannelShuffleFunctor { struct ChannelShuffleFunctor : OpKernel {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -70,8 +72,9 @@ struct ChannelShuffleFunctor { ...@@ -70,8 +72,9 @@ struct ChannelShuffleFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct ChannelShuffleFunctor<DeviceType::GPU, T> { struct ChannelShuffleFunctor<DeviceType::GPU, T> : OpKernel {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} ChannelShuffleFunctor(OpKernelContext *context, const int groups)
: OpKernel(context), groups_(groups) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -30,15 +31,17 @@ ...@@ -30,15 +31,17 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ConcatFunctorBase { struct ConcatFunctorBase : OpKernel {
explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {} ConcatFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase { struct ConcatFunctor : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
...@@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase { struct ConcatFunctor<DeviceType::GPU, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} ConcatFunctor(OpKernelContext *context, const int32_t axis)
: ConcatFunctorBase(context, axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
......
...@@ -42,14 +42,16 @@ ...@@ -42,14 +42,16 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct Conv2dFunctorBase { struct Conv2dFunctorBase : OpKernel {
Conv2dFunctorBase(const int *strides, Conv2dFunctorBase(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: strides_(strides), : OpKernel(context),
strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
...@@ -69,7 +71,8 @@ struct Conv2dFunctor; ...@@ -69,7 +71,8 @@ struct Conv2dFunctor;
template<> template<>
struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
...@@ -77,12 +80,14 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -77,12 +80,14 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed, const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
activation, activation,
relux_max_limit), relux_max_limit),
transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT),
is_filter_transformed_(is_filter_transformed), is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {} scratch_(scratch) {}
...@@ -721,7 +726,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase { ...@@ -721,7 +726,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
template<> template<>
struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
...@@ -729,7 +735,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase { ...@@ -729,7 +735,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed, const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -949,7 +956,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase { ...@@ -949,7 +956,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase { struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides, Conv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
...@@ -957,7 +965,8 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase { ...@@ -957,7 +965,8 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
const float relux_max_limit, const float relux_max_limit,
const bool is_filter_transformed, const bool is_filter_transformed,
ScratchBuffer *scratch) ScratchBuffer *scratch)
: Conv2dFunctorBase(strides, : Conv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -968,10 +977,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase { ...@@ -968,10 +977,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
} }
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -30,10 +31,12 @@ ...@@ -30,10 +31,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct CropFunctorBase { struct CropFunctorBase : OpKernel {
CropFunctorBase(const int axis, CropFunctorBase(OpKernelContext *context,
const int axis,
const std::vector<int> &offset) const std::vector<int> &offset)
: axis_(axis), : OpKernel(context),
axis_(axis),
offset_(offset) {} offset_(offset) {}
const int axis_; const int axis_;
...@@ -42,8 +45,10 @@ struct CropFunctorBase { ...@@ -42,8 +45,10 @@ struct CropFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct CropFunctor : CropFunctorBase { struct CropFunctor : CropFunctorBase {
CropFunctor(const int axis, const std::vector<int> &offset) CropFunctor(OpKernelContext *context,
: CropFunctorBase(axis, offset) {} const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
void crop_copy(const T* input_data, T* output_data, void crop_copy(const T* input_data, T* output_data,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
...@@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase { ...@@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase { struct CropFunctor<DeviceType::GPU, T> : CropFunctorBase {
CropFunctor(const int axis, const std::vector<int> &offset) CropFunctor(OpKernelContext *context,
: CropFunctorBase(axis, offset) {} const int axis,
const std::vector<int> &offset)
: CropFunctorBase(context, axis, offset) {}
MaceStatus operator()(const std::vector<const Tensor *> &input_list, MaceStatus operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
......
...@@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input, ...@@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input,
} }
} // namespace deconv } // namespace deconv
struct Deconv2dFunctorBase { struct Deconv2dFunctorBase : OpKernel {
Deconv2dFunctorBase(const std::vector<int> &strides, Deconv2dFunctorBase(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: strides_(strides), : OpKernel(context),
strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
output_shape_(output_shape), output_shape_(output_shape),
...@@ -210,13 +212,15 @@ struct Deconv2dFunctorBase { ...@@ -210,13 +212,15 @@ struct Deconv2dFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct Deconv2dFunctor : Deconv2dFunctorBase { struct Deconv2dFunctor : Deconv2dFunctorBase {
Deconv2dFunctor(const std::vector<int> &strides, Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: Deconv2dFunctorBase(strides, : Deconv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
output_shape, output_shape,
...@@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { ...@@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase { struct Deconv2dFunctor<DeviceType::GPU, T> : Deconv2dFunctorBase {
Deconv2dFunctor(const std::vector<int> &strides, Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type, const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const std::vector<index_t> &output_shape, const std::vector<index_t> &output_shape,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: Deconv2dFunctorBase(strides, : Deconv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
output_shape, output_shape,
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -29,9 +30,11 @@ namespace mace { ...@@ -29,9 +30,11 @@ namespace mace {
namespace kernels { namespace kernels {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct DepthToSpaceOpFunctor { struct DepthToSpaceOpFunctor : OpKernel {
explicit DepthToSpaceOpFunctor(const int block_size, bool d2s) DepthToSpaceOpFunctor(OpKernelContext *context,
: block_size_(block_size), d2s_(d2s) {} const int block_size,
bool d2s)
: OpKernel(context), block_size_(block_size), d2s_(d2s) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor { ...@@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::GPU, T> { struct DepthToSpaceOpFunctor<DeviceType::GPU, T> : OpKernel {
DepthToSpaceOpFunctor(const int block_size, bool d2s) DepthToSpaceOpFunctor(OpKernelContext *context,
: block_size_(block_size), d2s_(d2s) {} const int block_size,
bool d2s)
: OpKernel(context), block_size_(block_size), d2s_(d2s) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
......
...@@ -37,14 +37,16 @@ ...@@ -37,14 +37,16 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct DepthwiseConv2dFunctorBase { struct DepthwiseConv2dFunctorBase : OpKernel {
DepthwiseConv2dFunctorBase(const int *strides, DepthwiseConv2dFunctorBase(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: strides_(strides), : OpKernel(context),
strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
dilations_(dilations), dilations_(dilations),
...@@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor; ...@@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor;
template<> template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, float> struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
: public DepthwiseConv2dFunctorBase { : public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides, : DepthwiseConv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float> ...@@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
template<> template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
: public DepthwiseConv2dFunctorBase { : public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides, : DepthwiseConv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
...@@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> ...@@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
const int32_t *bias_data = nullptr; const int32_t *bias_data = nullptr;
if (bias == nullptr) { if (bias == nullptr) {
zero_bias.reset( zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); new Tensor(GetCPUAllocator(), DT_INT32));
zero_bias->Resize(bias_shape); zero_bias->Resize(bias_shape);
zero_bias->Clear(); zero_bias->Clear();
bias_data = zero_bias->data<int32_t>(); bias_data = zero_bias->data<int32_t>();
...@@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t> ...@@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, uint8_t>
template<typename T> template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::GPU, T> struct DepthwiseConv2dFunctor<DeviceType::GPU, T>
: DepthwiseConv2dFunctorBase { : DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides, DepthwiseConv2dFunctor(OpKernelContext *context,
const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations, const int *dilations,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides, : DepthwiseConv2dFunctorBase(context,
strides,
padding_type, padding_type,
paddings, paddings,
dilations, dilations,
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
struct EltwiseFunctorBase { struct EltwiseFunctorBase : OpKernel {
EltwiseFunctorBase(const EltwiseType type, EltwiseFunctorBase(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input,
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: type_(type), : OpKernel(context),
type_(type),
coeff_(coeff), coeff_(coeff),
scalar_input_(scalar_input), scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index), scalar_input_index_(scalar_input_index),
...@@ -823,12 +826,14 @@ struct EltwiseFunctorBase { ...@@ -823,12 +826,14 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase { struct EltwiseFunctor : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, // float as it comes from arg const float scalar_input, // float as it comes from arg
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: EltwiseFunctorBase(type, : EltwiseFunctorBase(context,
type,
coeff, coeff,
scalar_input, scalar_input,
scalar_input_index, scalar_input_index,
...@@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase { ...@@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase { struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, EltwiseFunctor(OpKernelContext *context,
const EltwiseType type,
const std::vector<float> &coeff, const std::vector<float> &coeff,
const float scalar_input, const float scalar_input,
const int32_t scalar_input_index, const int32_t scalar_input_index,
const DataFormat data_format) const DataFormat data_format)
: EltwiseFunctorBase(type, : EltwiseFunctorBase(context,
type,
coeff, coeff,
scalar_input, scalar_input,
scalar_input_index, scalar_input_index,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -30,8 +31,8 @@ template <DeviceType D, class T> ...@@ -30,8 +31,8 @@ template <DeviceType D, class T>
struct FillFunctor; struct FillFunctor;
template <> template <>
struct FillFunctor<DeviceType::CPU, float> { struct FillFunctor<DeviceType::CPU, float> : OpKernel {
FillFunctor() {} explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *shape, MaceStatus operator()(const Tensor *shape,
const Tensor *value, const Tensor *value,
......
...@@ -27,10 +27,12 @@ ...@@ -27,10 +27,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct FullyConnectedBase { struct FullyConnectedBase : OpKernel {
FullyConnectedBase(const ActivationType activation, FullyConnectedBase(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: activation_(activation), : OpKernel(context),
activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit) {}
const ActivationType activation_; const ActivationType activation_;
...@@ -42,9 +44,10 @@ struct FullyConnectedFunctor; ...@@ -42,9 +44,10 @@ struct FullyConnectedFunctor;
template <> template <>
struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
...@@ -86,9 +89,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase { ...@@ -86,9 +89,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
template <> template <>
struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
...@@ -117,7 +121,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { ...@@ -117,7 +121,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
const int32_t *bias_ptr = nullptr; const int32_t *bias_ptr = nullptr;
if (bias == nullptr) { if (bias == nullptr) {
zero_bias.reset( zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); new Tensor(GetCPUAllocator(), DT_INT32));
zero_bias->Resize(bias_shape); zero_bias->Resize(bias_shape);
zero_bias->Clear(); zero_bias->Clear();
bias_ptr = zero_bias->data<int32_t>(); bias_ptr = zero_bias->data<int32_t>();
...@@ -148,9 +152,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase { ...@@ -148,9 +152,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase { struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation, FullyConnectedFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit) const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {} : FullyConnectedBase(context, activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *weight, const Tensor *weight,
......
...@@ -21,13 +21,15 @@ ...@@ -21,13 +21,15 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct GatherBase { struct GatherBase : OpKernel {
explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {} GatherBase(OpKernelContext *context, int axis, float y)
: OpKernel(context), axis_(axis), y_(y) {}
int axis_; int axis_;
float y_; float y_;
...@@ -38,7 +40,8 @@ struct GatherFunctor; ...@@ -38,7 +40,8 @@ struct GatherFunctor;
template <> template <>
struct GatherFunctor<DeviceType::CPU, float> : GatherBase { struct GatherFunctor<DeviceType::CPU, float> : GatherBase {
explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {} GatherFunctor(OpKernelContext *context, int axis, float y)
: GatherBase(context, axis, y) {}
MaceStatus operator()(const Tensor *params, MaceStatus operator()(const Tensor *params,
const Tensor *indices, const Tensor *indices,
......
...@@ -1341,8 +1341,8 @@ void Gemm(const float *A, ...@@ -1341,8 +1341,8 @@ void Gemm(const float *A,
ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k); ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k);
const index_t ik_end = std::min(K, ik_begin + this_block_size_k); const index_t ik_end = std::min(K, ik_begin + this_block_size_k);
Tensor trans_a; Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor trans_b; Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
const float *real_a = nullptr; const float *real_a = nullptr;
const float *real_b = nullptr; const float *real_b = nullptr;
float *real_c = c_base + (ih_begin * width + iw_begin); float *real_c = c_base + (ih_begin * width + iw_begin);
...@@ -1399,8 +1399,8 @@ void GemmRef(const float *A, ...@@ -1399,8 +1399,8 @@ void GemmRef(const float *A,
const bool transpose_b) { const bool transpose_b) {
memset(C, 0, sizeof(float) * batch * height * width); memset(C, 0, sizeof(float) * batch * height * width);
Tensor trans_a; Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT);
Tensor trans_b; Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT);
float *trans_a_data = nullptr; float *trans_a_data = nullptr;
float *trans_b_data = nullptr; float *trans_b_data = nullptr;
if (transpose_a) { if (transpose_a) {
......
...@@ -20,21 +20,24 @@ ...@@ -20,21 +20,24 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/opencl/common.h" #include "mace/kernels/opencl/common.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ImageToBufferFunctorBase { struct ImageToBufferFunctorBase : OpKernel {
explicit ImageToBufferFunctorBase(const int wino_blk_size) ImageToBufferFunctorBase(OpKernelContext *context,
: wino_blk_size_(wino_blk_size) {} const int wino_blk_size)
: OpKernel(context),
wino_blk_size_(wino_blk_size) {}
const int wino_blk_size_; const int wino_blk_size_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ImageToBufferFunctor : ImageToBufferFunctorBase { struct ImageToBufferFunctor : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size) ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size)
: ImageToBufferFunctorBase(wino_blk_size) {} : ImageToBufferFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { ...@@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
template <typename T> template <typename T>
struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase { struct ImageToBufferFunctor<DeviceType::GPU, T> : ImageToBufferFunctorBase {
explicit ImageToBufferFunctor(const int wino_blk_size) ImageToBufferFunctor(OpKernelContext *context,
: ImageToBufferFunctorBase(wino_blk_size) {} const int wino_blk_size)
: ImageToBufferFunctorBase(context, wino_blk_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_KERNEL_H_
#define MACE_KERNELS_KERNEL_H_
#include "mace/core/op_kernel_context.h"
namespace mace {
namespace kernels {
struct OpKernel {
explicit OpKernel(OpKernelContext *context): context_(context) {}
OpKernelContext *context_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_KERNEL_H_
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -34,7 +34,9 @@ template<DeviceType D, typename T> ...@@ -34,7 +34,9 @@ template<DeviceType D, typename T>
struct LocalResponseNormFunctor; struct LocalResponseNormFunctor;
template<> template<>
struct LocalResponseNormFunctor<DeviceType::CPU, float> { struct LocalResponseNormFunctor<DeviceType::CPU, float> : OpKernel {
explicit LocalResponseNormFunctor(OpKernelContext *context)
: OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
int depth_radius, int depth_radius,
float bias, float bias,
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -35,9 +36,10 @@ template <DeviceType D, typename T> ...@@ -35,9 +36,10 @@ template <DeviceType D, typename T>
struct LSTMCellFunctor; struct LSTMCellFunctor;
template <typename T> template <typename T>
struct LSTMCellFunctor<DeviceType::GPU, T> { struct LSTMCellFunctor<DeviceType::GPU, T> : OpKernel{
explicit LSTMCellFunctor(T forget_bias) : LSTMCellFunctor(OpKernelContext *context, T forget_bias)
forget_bias_(static_cast<T>(forget_bias)) {} : OpKernel(context),
forget_bias_(static_cast<T>(forget_bias)) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const Tensor *pre_output, const Tensor *pre_output,
const Tensor *weight, const Tensor *weight,
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/gemm.h" #include "mace/kernels/gemm.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/gemmlowp_util.h"
...@@ -40,7 +41,8 @@ namespace mace { ...@@ -40,7 +41,8 @@ namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct MatMulFunctor { struct MatMulFunctor : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A, MaceStatus operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
...@@ -87,7 +89,7 @@ struct MatMulFunctor { ...@@ -87,7 +89,7 @@ struct MatMulFunctor {
// A * B = (B^T * A^T)^T // A * B = (B^T * A^T)^T
if (!transpose_b) { if (!transpose_b) {
if (B_transpose_.get() == nullptr) { if (B_transpose_.get() == nullptr) {
B_transpose_.reset(new Tensor(GetDeviceAllocator(D), B_transpose_.reset(new Tensor(context_->device()->allocator(),
DataTypeToEnum<T>::v())); DataTypeToEnum<T>::v()));
B_transpose_->Resize({batch, width, K}); B_transpose_->Resize({batch, width, K});
Tensor::MappingGuard guardbt(B_transpose_.get()); Tensor::MappingGuard guardbt(B_transpose_.get());
...@@ -112,7 +114,8 @@ struct MatMulFunctor { ...@@ -112,7 +114,8 @@ struct MatMulFunctor {
}; };
template <> template <>
struct MatMulFunctor<CPU, uint8_t> { struct MatMulFunctor<CPU, uint8_t> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder> template<gemmlowp::MapOrder AOrder, gemmlowp::MapOrder BOrder>
void MatMulImpl(const Tensor *A, void MatMulImpl(const Tensor *A,
const Tensor *B, const Tensor *B,
...@@ -208,7 +211,8 @@ struct MatMulFunctor<CPU, uint8_t> { ...@@ -208,7 +211,8 @@ struct MatMulFunctor<CPU, uint8_t> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct MatMulFunctor<DeviceType::GPU, T> { struct MatMulFunctor<DeviceType::GPU, T> : OpKernel {
explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *A, MaceStatus operator()(const Tensor *A,
const Tensor *B, const Tensor *B,
Tensor *C, Tensor *C,
......
...@@ -33,11 +33,11 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -33,11 +33,11 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
built_options.emplace("-Dactivation=" + kernel_name); built_options.emplace("-Dactivation=" + kernel_name);
...@@ -94,12 +94,12 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()( ...@@ -94,12 +94,12 @@ MaceStatus ActivationFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -34,7 +34,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -34,7 +34,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
const index_t width = input_tensors[0]->dim(2); const index_t width = input_tensors[0]->dim(2);
const index_t channels = input_tensors[0]->dim(3); const index_t channels = input_tensors[0]->dim(3);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
for (size_t i = 1; i < size; ++i) { for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]); MACE_CHECK_NOTNULL(input_tensors[i]);
...@@ -49,7 +49,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -49,7 +49,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
...@@ -96,7 +96,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()( ...@@ -96,7 +96,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std::string tuning_key = std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3)); output_tensor->dim(2), output_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
...@@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()( ...@@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("batch_norm_opencl_kernel", activation_, output->dim(0), Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_); output->dim(1), output->dim(2), output->dim(3), folded_constant_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-Dbias_add=" + kernel_name);
...@@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event; cl::Event event;
cl_int error; cl_int error;
......
...@@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( ...@@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
} }
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......
...@@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
built_options.emplace("-Dchannel_shuffle=" + kernel_name); built_options.emplace("-Dchannel_shuffle=" + kernel_name);
...@@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()( ...@@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -22,13 +22,15 @@ namespace mace { ...@@ -22,13 +22,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]); lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
...@@ -41,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -41,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
static MaceStatus Concat2(cl::Kernel *kernel, static MaceStatus Concat2(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt, const DataType dt,
...@@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel,
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
built_options.emplace("-Dconcat_channel=" + kernel_name); built_options.emplace("-Dconcat_channel=" + kernel_name);
...@@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel, ...@@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel,
*prev_input_shape = input0->shape(); *prev_input_shape = input0->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0), output->dim(1), Concat("concat_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS; return MACE_SUCCESS;
} }
static MaceStatus ConcatN(cl::Kernel *kernel, static MaceStatus ConcatN(OpKernelContext *context,
cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list, const std::vector<const Tensor *> &input_list,
const DataType dt, const DataType dt,
Tensor *output, Tensor *output,
...@@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
const index_t height = output->dim(1); const index_t height = output->dim(1);
const index_t width = output->dim(2); const index_t width = output->dim(2);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
...@@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width), static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
uint32_t idx = 0; uint32_t idx = 0;
OUT_OF_RANGE_SET_ARG_PTR; OUT_OF_RANGE_SET_ARG_PTR;
...@@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
for (size_t j = 0; j < 3; ++j) { for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]); roundup_gws[j] = RoundUp(gws[j], lws[j]);
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
error = runtime->command_queue().enqueueNDRangeKernel( error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, *kernel, cl::NullRange,
cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
...@@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, ...@@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel,
} }
} }
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [runtime, call_stats](CallStats *stats) { future->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) { if (stats != nullptr) {
stats->start_micros = call_stats.start_micros; stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros; stats->end_micros = stats->start_micros + call_stats.end_micros;
...@@ -234,12 +236,14 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()( ...@@ -234,12 +236,14 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
return Concat2(&kernel_, input_list[0], input_list[1], return Concat2(context_,
&kernel_, input_list[0], input_list[1],
DataTypeToEnum<T>::value, &input_shape_, output, future, DataTypeToEnum<T>::value, &input_shape_, output, future,
&kwg_size_, &kernel_error_); &kwg_size_, &kernel_error_);
default: default:
if (divisible_four) { if (divisible_four) {
return ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, return ConcatN(context_,
&kernel_, input_list, DataTypeToEnum<T>::value, output,
future, &kwg_size_, &kernel_error_); future, &kwg_size_, &kernel_error_);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
......
...@@ -18,7 +18,8 @@ ...@@ -18,7 +18,8 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t *kwg_size, uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error); std::unique_ptr<BufferBase> *kernel_error);
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef MaceStatus (*Conv2dOpenclFunction)( typedef MaceStatus (*Conv2dOpenclFunction)(
cl::Kernel * kernel, const Tensor *input, const Tensor *filter, OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
const Tensor *bias, const int stride, const int *padding, const Tensor *filter, const Tensor *bias, const int stride,
const int *dilations, const ActivationType activation, const int *padding, const int *dilations,
const ActivationType activation,
const float relux_max_limit, const DataType dt, const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future, std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error); uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
...@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 3 && if (kernel_h == kernel_w && kernel_h <= 3 &&
selector[kernel_h - 1] != nullptr) { selector[kernel_h - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
return conv2d_func( return conv2d_func(context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
} else { } else {
return Conv2dOpencl( return Conv2dOpencl(context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
......
...@@ -25,14 +25,16 @@ namespace { ...@@ -25,14 +25,16 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value. // TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128; const uint32_t lws_limit = 128;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = runtime->device_compute_units();
const uint32_t base = const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
MACE_CHECK(input_batch == batch); MACE_CHECK(input_batch == batch);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
...@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -24,15 +24,17 @@ namespace kernels { ...@@ -24,15 +24,17 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = std::max<uint32_t>( uint32_t compute_units = std::max<uint32_t>(
OpenCLRuntime::Global()->device_compute_units() / 2, 1); runtime->device_compute_units() / 2, 1);
const uint32_t base = const uint32_t base =
std::max<uint32_t>( std::max<uint32_t>(
std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1); std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
...@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv<index_t, 5>(width); const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
...@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -26,7 +26,8 @@ namespace { ...@@ -26,7 +26,8 @@ namespace {
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value. // TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20; const uint32_t lws_limit = 20;
std::vector<uint32_t> LocalWS(const uint32_t *gws, std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kernel_size, const uint32_t kernel_size,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
...@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t compute_units = runtime->device_compute_units();
const uint32_t base = const uint32_t base =
std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
...@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, ...@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
} // namespace } // namespace
extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, extern MaceStatus Conv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-Dconv_2d=" + kernel_name);
...@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, ...@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1), Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3), filter->dim(2), filter->dim(3)); output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
std::vector<uint32_t> lws = std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
......
...@@ -22,13 +22,15 @@ namespace mace { ...@@ -22,13 +22,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]); lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
...@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(output->dim(0) * output->dim(1)) static_cast<uint32_t>(output->dim(0) * output->dim(1))
}; };
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
built_options.emplace("-Dcrop=" + kernel_name); built_options.emplace("-Dcrop=" + kernel_name);
...@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()( ...@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input0->shape(); input_shape_ = input0->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("crop_opencl_kernel", output->dim(0), output->dim(1), Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -20,7 +20,8 @@ namespace kernels { ...@@ -20,7 +20,8 @@ namespace kernels {
namespace { namespace {
MaceStatus Deconv2dOpencl(cl::Kernel *kernel, MaceStatus Deconv2dOpencl(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *bias, const Tensor *bias,
...@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
const int align_w = stride_w - 1 - padding_w; const int align_w = stride_w - 1 - padding_w;
const int kernel_size = filter->dim(2) * filter->dim(3); const int kernel_size = filter->dim(2) * filter->dim(3);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
built_options.emplace("-Ddeconv_2d=" + kernel_name); built_options.emplace("-Ddeconv_2d=" + kernel_name);
...@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, ...@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("deconv2d_opencl_kernel_", activation, output->dim(0), Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3)); output->dim(1), output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
...@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
&output_image_shape); &output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(), return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
paddings.data(), activation_, relux_max_limit_, strides_.data(), paddings.data(), activation_,
DataTypeToEnum<T>::value, &input_shape_, output, future, relux_max_limit_, DataTypeToEnum<T>::value,
&input_shape_, output, future,
&kwg_size_, &kernel_error_); &kwg_size_, &kernel_error_);
} }
......
...@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()( ...@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -24,13 +24,15 @@ namespace kernels { ...@@ -24,13 +24,15 @@ namespace kernels {
namespace { namespace {
// (inputs + weights + outputs) * array_size * sizeof(float) // (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize; uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { ...@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
} // namespace } // namespace
static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, static MaceStatus DepthwiseConv2d(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, // NHWC const Tensor *input, // NHWC
const Tensor *filter, // HWIM const Tensor *filter, // HWIM
const Tensor *bias, const Tensor *bias,
...@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
...@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, ...@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size); const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
std::string tuning_key = std::string tuning_key =
Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
...@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) { if (strides_[0] != strides_[1]) {
LOG(WARNING) << "OpenCL depthwise conv2d kernel with " LOG(FATAL) << "GPU depthwise conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version"; << " is not implemented yet.";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
strides_, padding_type_, paddings_, dilations_, activation_,
relux_max_limit_)(input, filter, bias, output, future);
} }
// Create a fake conv_2d filter to calculate the paddings and output size // Create a fake conv_2d filter to calculate the paddings and output size
...@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()( ...@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return DepthwiseConv2d( return DepthwiseConv2d(
context_,
&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_, activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
output, future, &kwg_size_, &kernel_error_); output, future, &kwg_size_, &kernel_error_);
......
...@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch_height_pixels)}; static_cast<uint32_t>(batch_height_pixels)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
...@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0, ...@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_ = input0->shape(); input_shape_ = input0->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
return MACE_SUCCESS; return MACE_SUCCESS;
......
...@@ -22,7 +22,8 @@ namespace kernels { ...@@ -22,7 +22,8 @@ namespace kernels {
namespace { namespace {
template <typename T> template <typename T>
MaceStatus FCWXKernel(cl::Kernel *kernel, MaceStatus FCWXKernel(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
...@@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size); const index_t output_blocks = RoundUpDiv4(output_size);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width");
...@@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, ...@@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel,
} }
template <typename T> template <typename T>
MaceStatus FCWTXKernel(cl::Kernel *kernel, MaceStatus FCWTXKernel(OpKernelContext *context,
cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
...@@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::unique_ptr<BufferBase> *kernel_error) { std::unique_ptr<BufferBase> *kernel_error) {
MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws); MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(*kernel_error); OUT_OF_RANGE_CONFIG(*kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
...@@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, ...@@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel,
std::string tuning_key = std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2),
output->dim(3)); output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key,
gws->data(), *lws, future)); gws->data(), *lws, future));
OUT_OF_RANGE_VALIDATION(*kernel_error); OUT_OF_RANGE_VALIDATION(*kernel_error);
...@@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()( ...@@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
&output_image_shape); &output_image_shape);
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
return FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output, return FCWXKernel<T>(context_,
&kernel_, input, weight, bias, &input_shape_, output,
activation_, &gws_, &lws_, relux_max_limit_, future, activation_, &gws_, &lws_, relux_max_limit_, future,
&kernel_error_); &kernel_error_);
} }
......
...@@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) { ...@@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) {
} }
} }
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) { const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t cache_size = uint64_t cache_size = runtime->device_global_mem_cache_size();
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = lws[2] =
...@@ -245,13 +245,12 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, ...@@ -245,13 +245,12 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
return lws; return lws;
} }
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
...@@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, ...@@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel,
} }
return error; return error;
}; };
OpenCLProfilingTimer timer(&event); OpenCLProfilingTimer timer(runtime, &event);
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>( cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer); tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err); MACE_CL_RET_STATUS(err);
if (future != nullptr) { if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) { future->wait_fn = [runtime, event](CallStats *stats) {
event.wait(); event.wait();
if (stats != nullptr) { if (stats != nullptr) {
OpenCLRuntime::Global()->GetCallStats(event, stats); runtime->GetCallStats(event, stats);
} }
}; };
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size = const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel)); static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
...@@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, ...@@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel,
} }
return error; return error;
}; };
OpenCLProfilingTimer timer(&event); OpenCLProfilingTimer timer(runtime, &event);
cl_int err = Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>( cl_int err = runtime->tuner()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer); tuning_key, lws, params_generator, func, &timer);
MACE_CL_RET_STATUS(err); MACE_CL_RET_STATUS(err);
......
...@@ -31,11 +31,11 @@ ...@@ -31,11 +31,11 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
#define OUT_OF_RANGE_CONFIG(kernel_error) \ #define OUT_OF_RANGE_CONFIG(kernel_error, context) \
if (runtime->IsOutOfRangeCheckEnabled()) { \ if (runtime->IsOutOfRangeCheckEnabled()) { \
built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ built_options.emplace("-DOUT_OF_RANGE_CHECK"); \
(kernel_error) = std::move(std::unique_ptr<Buffer>( \ (kernel_error) = std::move(std::unique_ptr<Buffer>( \
new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ new Buffer((context)->device()->allocator()))); \
MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \
(kernel_error)->Map(nullptr); \ (kernel_error)->Map(nullptr); \
*((kernel_error)->mutable_data<char>()) = 0; \ *((kernel_error)->mutable_data<char>()) = 0; \
...@@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt); ...@@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpCompatibleCLDt(const DataType dt); std::string DtToUpCompatibleCLDt(const DataType dt);
// Tuning or Run OpenCL kernel with 3D work group size // Tuning or Run OpenCL kernel with 3D work group size
MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
// Tuning or Run OpenCL kernel with 2D work group size // Tuning or Run OpenCL kernel with 2D work group size
MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
...@@ -162,7 +164,8 @@ std::string Concat(Args... args) { ...@@ -162,7 +164,8 @@ std::string Concat(Args... args) {
return ss.str(); return ss.str();
} }
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws, std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size); const uint32_t kwg_size);
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
......
...@@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()( ...@@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
break; break;
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
......
...@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()( ...@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
const index_t width = input->dim(1); const index_t width = input->dim(1);
const index_t width_blocks = width / 4; const index_t width_blocks = width / 4;
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
...@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()( ...@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::string tuning_key = std::string tuning_key =
Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
static_cast<uint32_t>(height_blocks * batch), static_cast<uint32_t>(height_blocks * batch),
}; };
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
...@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A, ...@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0}; const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/core/op_kernel_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
...@@ -25,14 +27,15 @@ namespace mace { ...@@ -25,14 +27,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
bool BufferToImageOpImpl(Tensor *buffer, bool BufferToImageOpImpl(OpKernelContext *context,
Tensor *buffer,
Tensor *image, Tensor *image,
const std::vector<size_t> &image_shape) { const std::vector<size_t> &image_shape) {
std::unique_ptr<BufferBase> kernel_error; std::unique_ptr<BufferBase> kernel_error;
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]), uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])}; static_cast<uint32_t>(image_shape[1])};
auto runtime = OpenCLRuntime::Global(); auto runtime = context->device()->opencl_runtime();
std::string kernel_name = "in_out_buffer_to_image"; std::string kernel_name = "in_out_buffer_to_image";
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
...@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer, ...@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
built_options.emplace(kernel_name_ss.str()); built_options.emplace(kernel_name_ss.str());
OUT_OF_RANGE_CONFIG(kernel_error); OUT_OF_RANGE_CONFIG(kernel_error, context);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
if (buffer->dtype() == image->dtype()) { if (buffer->dtype() == image->dtype()) {
built_options.emplace("-DDATA_TYPE=" + built_options.emplace("-DDATA_TYPE=" +
...@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) { ...@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) {
index_t width = 7; index_t width = 7;
index_t channels = 11; index_t channels = 11;
std::vector<index_t> buffer_shape = {batch, height, width, channels}; GPUContext gpu_context;
std::unique_ptr<Device> device(new GPUDevice(gpu_context.opencl_tuner()));
Workspace ws; Workspace ws;
OpKernelContext context(&ws, device.get());
std::vector<index_t> buffer_shape = {batch, height, width, channels};
Tensor *buffer = Tensor *buffer =
ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU), ws.CreateTensor("Buffer", device->allocator(),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
buffer->Resize(buffer_shape); buffer->Resize(buffer_shape);
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU), Tensor *image = ws.CreateTensor("Image", device->allocator(),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape)); ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape));
std::vector<size_t> overflow_image_shape = image_shape; std::vector<size_t> overflow_image_shape = image_shape;
for (size_t i = 0; i < overflow_image_shape.size(); ++i) { for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
overflow_image_shape[i] += 1; overflow_image_shape[i] += 1;
} }
ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape)); ASSERT_TRUE(BufferToImageOpImpl(&context,
buffer,
image,
overflow_image_shape));
} }
} // namespace kernels } // namespace kernels
......
...@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
built_options.emplace("-Dpad=" + kernel_name); built_options.emplace("-Dpad=" + kernel_name);
...@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), std::string tuning_key = Concat("pad", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -23,13 +23,15 @@ namespace kernels { ...@@ -23,13 +23,15 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
lws[2] = lws[2] =
...@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
built_options.emplace("-Dpooling=" + kernel_name); built_options.emplace("-Dpooling=" + kernel_name);
...@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input, ...@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}; };
} }
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws.data(), lws, future)); gws.data(), lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels); const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t image_size = static_cast<uint32_t >(in_height * in_width); const uint32_t image_size = static_cast<uint32_t >(in_height * in_width);
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
std::vector<uint32_t> gws(3); std::vector<uint32_t> gws(3);
std::vector<uint32_t> lws(3); std::vector<uint32_t> lws(3);
std::vector<index_t> output_shape{batch, 1, 1, channels}; std::vector<index_t> output_shape{batch, 1, 1, channels};
...@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()( ...@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
built_options.emplace("-Dreduce_mean=" + kernel_name); built_options.emplace("-Dreduce_mean=" + kernel_name);
......
...@@ -23,9 +23,11 @@ namespace mace { ...@@ -23,9 +23,11 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint64_t cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()( ...@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(out_width), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
auto dt = DataTypeToEnum<T>::value;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
...@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()( ...@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -23,13 +23,15 @@ namespace mace { ...@@ -23,13 +23,15 @@ namespace mace {
namespace kernels { namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (lws[1] >= base) { if (lws[1] >= base) {
...@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(out_width), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
...@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()( ...@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
output->dim(2), output->dim(3)); output->dim(2), output->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -24,13 +24,15 @@ namespace kernels { ...@@ -24,13 +24,15 @@ namespace kernels {
namespace { namespace {
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0); std::vector<uint32_t> lws(4, 0);
if (kwg_size == 0) { if (kwg_size == 0) {
lws[0] = lws[1] = lws[2] = 1; lws[0] = lws[1] = lws[2] = 1;
} else { } else {
uint64_t uint64_t
cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); cache_size = runtime->device_global_mem_cache_size();
uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1); uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
lws[1] = std::min<uint32_t>(gws[1], kwg_size); lws[1] = std::min<uint32_t>(gws[1], kwg_size);
if (gws[0] < base) { if (gws[0] < base) {
...@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
built_options.emplace("-Dsoftmax=" + kernel_name); built_options.emplace("-Dsoftmax=" + kernel_name);
...@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits, ...@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_ = logits->shape(); input_shape_ = logits->shape();
} }
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_); std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat("softmax_opencl_kernel", batch, height, width, channels); Concat("softmax_opencl_kernel", batch, height, width, channels);
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)), chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))}; static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
...@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()( ...@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_ = space_tensor->shape(); space_shape_ = space_tensor->shape();
} }
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
std::string tuning_key = std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3)); batch_tensor->dim(2), batch_tensor->dim(3));
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()( ...@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
output_list[i]->ResizeImage(output_shape, image_shape)); output_list[i]->ResizeImage(output_shape, image_shape));
} }
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
built_options.emplace("-Dsplit=" + kernel_name); built_options.emplace("-Dsplit=" + kernel_name);
...@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()( ...@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)), static_cast<uint32_t>(input->dim(0) * input->dim(1)),
}; };
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_); const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
cl::Event event; cl::Event event;
CallStats call_stats{INT64_MAX, 0}; CallStats call_stats{INT64_MAX, 0};
for (size_t i = 0; i < outputs_count; ++i) { for (size_t i = 0; i < outputs_count; ++i) {
......
...@@ -24,12 +24,12 @@ namespace kernels { ...@@ -24,12 +24,12 @@ namespace kernels {
template <typename T> template <typename T>
MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name; std::string obfuscated_kernel_name;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) { if (wino_blk_size_ == 4) {
obfuscated_kernel_name = obfuscated_kernel_name =
...@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_tensor->dim(0), output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(1),
output_tensor->dim(2)); output_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
...@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const std::vector<const Tensor*> &inputs, const std::vector<const Tensor*> &inputs,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
auto runtime = OpenCLRuntime::Global(); auto runtime = context_->device()->opencl_runtime();
const Tensor *input_tensor = inputs[0]; const Tensor *input_tensor = inputs[0];
const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr; const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
...@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name; std::string obfuscated_kernel_name;
std::set<std::string> built_options; std::set<std::string> built_options;
OUT_OF_RANGE_CONFIG(kernel_error_); OUT_OF_RANGE_CONFIG(kernel_error_, context_);
NON_UNIFORM_WG_CONFIG; NON_UNIFORM_WG_CONFIG;
if (wino_blk_size_ == 4) { if (wino_blk_size_ == 4) {
obfuscated_kernel_name = obfuscated_kernel_name =
...@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()( ...@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2)); output_tensor->dim(3), input_tensor->dim(2));
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
gws, lws, future)); gws, lws, future));
OUT_OF_RANGE_VALIDATION(kernel_error_); OUT_OF_RANGE_VALIDATION(kernel_error_);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -29,10 +30,13 @@ ...@@ -29,10 +30,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct PadFunctorBase { struct PadFunctorBase : OpKernel {
PadFunctorBase(const std::vector<int> &paddings, PadFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: paddings_(paddings), constant_value_(constant_value) {} : OpKernel(context),
paddings_(paddings),
constant_value_(constant_value) {}
std::vector<int> paddings_; std::vector<int> paddings_;
float constant_value_; float constant_value_;
...@@ -40,9 +44,10 @@ struct PadFunctorBase { ...@@ -40,9 +44,10 @@ struct PadFunctorBase {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct PadFunctor : public PadFunctorBase { struct PadFunctor : public PadFunctorBase {
PadFunctor(const std::vector<int> &paddings, PadFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: PadFunctorBase(paddings, constant_value) {} : PadFunctorBase(context, paddings, constant_value) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase { ...@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase { struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings, PadFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const float constant_value) const float constant_value)
: PadFunctorBase(paddings, constant_value) {} : PadFunctorBase(context, paddings, constant_value) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/kernel.h"
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -41,14 +42,16 @@ enum PoolingType { ...@@ -41,14 +42,16 @@ enum PoolingType {
namespace kernels { namespace kernels {
struct PoolingFunctorBase { struct PoolingFunctorBase : OpKernel {
PoolingFunctorBase(const PoolingType pooling_type, PoolingFunctorBase(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: pooling_type_(pooling_type), : OpKernel(context),
pooling_type_(pooling_type),
kernels_(kernels), kernels_(kernels),
strides_(strides), strides_(strides),
padding_type_(padding_type), padding_type_(padding_type),
...@@ -68,14 +71,20 @@ struct PoolingFunctor; ...@@ -68,14 +71,20 @@ struct PoolingFunctor;
template <> template <>
struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase( : PoolingFunctorBase(context,
pooling_type, kernels, strides, padding_type, paddings, dilations) { pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {
} }
void MaxPooling(const float *input, void MaxPooling(const float *input,
...@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase { ...@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
template <> template <>
struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase { struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase( : PoolingFunctorBase(context,
pooling_type, kernels, strides, padding_type, paddings, dilations) { pooling_type,
} kernels,
strides,
padding_type,
paddings,
dilations) {}
void MaxPooling(const uint8_t *input, void MaxPooling(const uint8_t *input,
const index_t *in_shape, const index_t *in_shape,
...@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase { ...@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase { struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type, PoolingFunctor(OpKernelContext *context,
const PoolingType pooling_type,
const int *kernels, const int *kernels,
const int *strides, const int *strides,
const Padding padding_type, const Padding padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int *dilations) const int *dilations)
: PoolingFunctorBase( : PoolingFunctorBase(context,
pooling_type, kernels, strides, padding_type, paddings, dilations) { pooling_type,
kernels,
strides,
padding_type,
paddings,
dilations) {
} }
MaceStatus operator()(const Tensor *input_tensor, MaceStatus operator()(const Tensor *input_tensor,
Tensor *output_tensor, Tensor *output_tensor,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr, ...@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr,
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct ProposalFunctor { struct ProposalFunctor : OpKernel {
ProposalFunctor(const int min_size, ProposalFunctor(OpKernelContext *context,
const int min_size,
const float nms_thresh, const float nms_thresh,
const int pre_nms_top_n, const int pre_nms_top_n,
const int post_nms_top_n, const int post_nms_top_n,
...@@ -130,6 +132,7 @@ struct ProposalFunctor { ...@@ -130,6 +132,7 @@ struct ProposalFunctor {
const int base_size, const int base_size,
const std::vector<int> &scales, const std::vector<int> &scales,
const std::vector<float> &ratios) : const std::vector<float> &ratios) :
OpKernel(context),
min_size_(min_size), min_size_(min_size),
thresh_(nms_thresh), thresh_(nms_thresh),
pre_nms_top_n_(pre_nms_top_n), pre_nms_top_n_(pre_nms_top_n),
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -173,8 +174,8 @@ template<DeviceType D, typename T> ...@@ -173,8 +174,8 @@ template<DeviceType D, typename T>
struct QuantizeFunctor; struct QuantizeFunctor;
template<> template<>
struct QuantizeFunctor<CPU, uint8_t> { struct QuantizeFunctor<CPU, uint8_t> : OpKernel {
QuantizeFunctor() {} explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const bool non_zero, const bool non_zero,
...@@ -212,8 +213,8 @@ template<DeviceType D, typename T> ...@@ -212,8 +213,8 @@ template<DeviceType D, typename T>
struct DequantizeFunctor; struct DequantizeFunctor;
template<> template<>
struct DequantizeFunctor<CPU, uint8_t> { struct DequantizeFunctor<CPU, uint8_t> : OpKernel {
DequantizeFunctor() {} explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#endif #endif
...@@ -31,10 +32,12 @@ ...@@ -31,10 +32,12 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct ReduceFunctorBase { struct ReduceFunctorBase : OpKernel {
ReduceFunctorBase(const std::vector<int> &axis, ReduceFunctorBase(OpKernelContext *context,
const std::vector<int> &axis,
const bool keep_dims) const bool keep_dims)
: keep_dims_(keep_dims), : OpKernel(context),
keep_dims_(keep_dims),
axis_(axis) {} axis_(axis) {}
bool keep_dims_; bool keep_dims_;
bool reduce_first_axis_; bool reduce_first_axis_;
...@@ -44,10 +47,11 @@ struct ReduceFunctorBase { ...@@ -44,10 +47,11 @@ struct ReduceFunctorBase {
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ReduceMeanFunctor : ReduceFunctorBase{ struct ReduceMeanFunctor : ReduceFunctorBase {
ReduceMeanFunctor(const std::vector<int> &axis, ReduceMeanFunctor(OpKernelContext *context,
const std::vector<int> &axis,
const bool keep_dims) const bool keep_dims)
: ReduceFunctorBase(axis, keep_dims) {} : ReduceFunctorBase(context, axis, keep_dims) {}
void Simplify(const Tensor *input) { void Simplify(const Tensor *input) {
std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false); std::vector<bool> bitmap(static_cast<uint32_t>(input->dim_size()), false);
...@@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{ ...@@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{
template <typename T> template <typename T>
struct ReduceMeanFunctor<DeviceType::GPU, T> struct ReduceMeanFunctor<DeviceType::GPU, T>
: ReduceFunctorBase { : ReduceFunctorBase {
ReduceMeanFunctor(const std::vector<int> axis, ReduceMeanFunctor(OpKernelContext *context,
const std::vector<int> axis,
const bool keep_dims) const bool keep_dims)
: ReduceFunctorBase(axis, keep_dims) {} : ReduceFunctorBase(context, axis, keep_dims) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output_tensor, Tensor *output_tensor,
......
...@@ -19,17 +19,14 @@ ...@@ -19,17 +19,14 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ReshapeFunctor { struct ReshapeFunctor : OpKernel {
ReshapeFunctor() {} explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<index_t> &out_shape, const std::vector<index_t> &out_shape,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -137,10 +138,11 @@ inline void ResizeImage(const float *images, ...@@ -137,10 +138,11 @@ inline void ResizeImage(const float *images,
} }
} }
struct ResizeBicubicFunctorBase { struct ResizeBicubicFunctorBase : OpKernel {
ResizeBicubicFunctorBase(const std::vector<index_t> &size, ResizeBicubicFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners) bool align_corners)
: align_corners_(align_corners) { : OpKernel(context), align_corners_(align_corners) {
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
out_height_ = size[0]; out_height_ = size[0];
out_width_ = size[1]; out_width_ = size[1];
...@@ -158,8 +160,10 @@ struct ResizeBicubicFunctor; ...@@ -158,8 +160,10 @@ struct ResizeBicubicFunctor;
template<> template<>
struct ResizeBicubicFunctor<DeviceType::CPU, float> struct ResizeBicubicFunctor<DeviceType::CPU, float>
: ResizeBicubicFunctorBase { : ResizeBicubicFunctorBase {
ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBicubicFunctor(OpKernelContext *context,
: ResizeBicubicFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBicubicFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -204,8 +208,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float> ...@@ -204,8 +208,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>
template<typename T> template<typename T>
struct ResizeBicubicFunctor<DeviceType::GPU, T> struct ResizeBicubicFunctor<DeviceType::GPU, T>
: ResizeBicubicFunctorBase { : ResizeBicubicFunctorBase {
ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBicubicFunctor(OpKernelContext *context,
: ResizeBicubicFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBicubicFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -113,10 +114,12 @@ inline void ResizeImage(const float *images, ...@@ -113,10 +114,12 @@ inline void ResizeImage(const float *images,
} }
} }
struct ResizeBilinearFunctorBase { struct ResizeBilinearFunctorBase : OpKernel {
ResizeBilinearFunctorBase(const std::vector<index_t> &size, ResizeBilinearFunctorBase(OpKernelContext *context,
const std::vector<index_t> &size,
bool align_corners) bool align_corners)
: align_corners_(align_corners) { : OpKernel(context),
align_corners_(align_corners) {
MACE_CHECK(size.size() == 2); MACE_CHECK(size.size() == 2);
out_height_ = size[0]; out_height_ = size[0];
out_width_ = size[1]; out_width_ = size[1];
...@@ -134,8 +137,10 @@ struct ResizeBilinearFunctor; ...@@ -134,8 +137,10 @@ struct ResizeBilinearFunctor;
template<> template<>
struct ResizeBilinearFunctor<DeviceType::CPU, float> struct ResizeBilinearFunctor<DeviceType::CPU, float>
: ResizeBilinearFunctorBase { : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(OpKernelContext *context,
: ResizeBilinearFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -187,8 +192,10 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float> ...@@ -187,8 +192,10 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
template<typename T> template<typename T>
struct ResizeBilinearFunctor<DeviceType::GPU, T> struct ResizeBilinearFunctor<DeviceType::GPU, T>
: ResizeBilinearFunctorBase { : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners) ResizeBilinearFunctor(OpKernelContext *context,
: ResizeBilinearFunctorBase(size, align_corners) {} const std::vector<index_t> &size,
bool align_corners)
: ResizeBilinearFunctorBase(context, size, align_corners) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -89,12 +89,14 @@ void ScalarEltwise(const T* in0, ...@@ -89,12 +89,14 @@ void ScalarEltwise(const T* in0,
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ScalarMathFunctor { struct ScalarMathFunctor : OpKernel {
explicit ScalarMathFunctor(const EltwiseType type, ScalarMathFunctor(OpKernelContext *context,
const std::vector<float> &coeff, const EltwiseType type,
const float scalar_input, const std::vector<float> &coeff,
const int32_t scalar_input_index) const float scalar_input,
: type_(type), const int32_t scalar_input_index)
: OpKernel(context),
type_(type),
coeff_(coeff), coeff_(coeff),
scalar_input_(scalar_input), scalar_input_(scalar_input),
scalar_input_index_(scalar_input_index) {} scalar_input_index_(scalar_input_index) {}
......
...@@ -89,7 +89,7 @@ typedef Major PackOrder; ...@@ -89,7 +89,7 @@ typedef Major PackOrder;
template<typename T> template<typename T>
class PackedBlock { class PackedBlock {
public: public:
PackedBlock() : data_tensor_(GetDeviceAllocator(CPU), PackedBlock() : data_tensor_(GetCPUAllocator(),
DataTypeToEnum<T>::v()) {} DataTypeToEnum<T>::v()) {}
const T *data() { const T *data() {
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/kernels/fixpoint.h" #include "mace/kernels/fixpoint.h"
#include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/quantize.h" #include "mace/kernels/quantize.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -40,7 +41,8 @@ template<DeviceType D, typename T> ...@@ -40,7 +41,8 @@ template<DeviceType D, typename T>
struct SoftmaxFunctor; struct SoftmaxFunctor;
template<> template<>
struct SoftmaxFunctor<DeviceType::CPU, float> { struct SoftmaxFunctor<DeviceType::CPU, float> : OpKernel {
explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6; ...@@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6;
static const int kSumExpIntBits = 12; static const int kSumExpIntBits = 12;
template<> template<>
struct SoftmaxFunctor<DeviceType::CPU, uint8_t> { struct SoftmaxFunctor<DeviceType::CPU, uint8_t> : OpKernel {
explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
...@@ -354,7 +357,8 @@ struct SoftmaxFunctor<DeviceType::CPU, uint8_t> { ...@@ -354,7 +357,8 @@ struct SoftmaxFunctor<DeviceType::CPU, uint8_t> {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SoftmaxFunctor<DeviceType::GPU, T> { struct SoftmaxFunctor<DeviceType::GPU, T> : OpKernel {
explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {}
MaceStatus operator()(const Tensor *logits, MaceStatus operator()(const Tensor *logits,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -30,11 +30,13 @@ ...@@ -30,11 +30,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct SpaceToBatchFunctorBase { struct SpaceToBatchFunctorBase : OpKernel {
SpaceToBatchFunctorBase(const std::vector<int> &paddings, SpaceToBatchFunctorBase(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
: paddings_(paddings.begin(), paddings.end()), : OpKernel(context),
paddings_(paddings.begin(), paddings.end()),
block_shape_(block_shape.begin(), block_shape.end()), block_shape_(block_shape.begin(), block_shape.end()),
b2s_(b2s) { b2s_(b2s) {
MACE_CHECK( MACE_CHECK(
...@@ -135,10 +137,11 @@ struct SpaceToBatchFunctor; ...@@ -135,10 +137,11 @@ struct SpaceToBatchFunctor;
template<> template<>
struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
MaceStatus operator()(Tensor *space_tensor, MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor, Tensor *batch_tensor,
...@@ -319,10 +322,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase { ...@@ -319,10 +322,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template <typename T> template <typename T>
struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase { struct SpaceToBatchFunctor<DeviceType::GPU, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings, SpaceToBatchFunctor(OpKernelContext *context,
const std::vector<int> &paddings,
const std::vector<int> &block_shape, const std::vector<int> &block_shape,
bool b2s) bool b2s)
: SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {}
MaceStatus operator()(Tensor *space_tensor, MaceStatus operator()(Tensor *space_tensor,
Tensor *batch_tensor, Tensor *batch_tensor,
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -31,15 +32,17 @@ ...@@ -31,15 +32,17 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct SplitFunctorBase { struct SplitFunctorBase : OpKernel {
explicit SplitFunctorBase(const int32_t axis) : axis_(axis) {} SplitFunctorBase(OpKernelContext *context, const int32_t axis)
: OpKernel(context), axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct SplitFunctor : SplitFunctorBase { struct SplitFunctor : SplitFunctorBase {
explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} SplitFunctor(OpKernelContext *context, const int32_t axis)
: SplitFunctorBase(context, axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
...@@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase { ...@@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
template<typename T> template<typename T>
struct SplitFunctor<DeviceType::GPU, T> : SplitFunctorBase { struct SplitFunctor<DeviceType::GPU, T> : SplitFunctorBase {
explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} SplitFunctor(OpKernelContext *context, const int32_t axis)
: SplitFunctorBase(context, axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
uint32_t kwg_size_; uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_; std::unique_ptr<BufferBase> kernel_error_;
......
...@@ -22,14 +22,16 @@ ...@@ -22,14 +22,16 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct StackFunctor { struct StackFunctor : OpKernel {
explicit StackFunctor(int axis) : axis_(axis) {} StackFunctor(OpKernelContext *context, int axis)
: OpKernel(context), axis_(axis) {}
MaceStatus operator()(const std::vector<const Tensor *> &inputs, MaceStatus operator()(const std::vector<const Tensor *> &inputs,
Tensor *output, Tensor *output,
......
...@@ -21,26 +21,29 @@ ...@@ -21,26 +21,29 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct StridedSliceFunctor { struct StridedSliceFunctor : OpKernel {
StridedSliceFunctor(int begin_mask, StridedSliceFunctor(OpKernelContext *context,
int begin_mask,
int end_mask, int end_mask,
int ellipsis_mask, int ellipsis_mask,
int new_axis_mask, int new_axis_mask,
int shrink_axis_mask, int shrink_axis_mask,
bool is_slice) bool is_slice)
: begin_mask_(begin_mask), : OpKernel(context),
begin_mask_(begin_mask),
end_mask_(end_mask), end_mask_(end_mask),
ellipsis_mask_(ellipsis_mask), ellipsis_mask_(ellipsis_mask),
new_axis_mask_(new_axis_mask), new_axis_mask_(new_axis_mask),
shrink_axis_mask_(shrink_axis_mask), shrink_axis_mask_(shrink_axis_mask),
is_slice_(is_slice), is_slice_(is_slice),
tmp_strides_tensor_(GetDeviceAllocator(D), tmp_strides_tensor_(context->device()->allocator(),
DataTypeToEnum<int32_t>::v()) {} DataTypeToEnum<int32_t>::v()) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
......
...@@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input, ...@@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input,
} }
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct TransposeFunctor { struct TransposeFunctor : OpKernel {
explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {} TransposeFunctor(OpKernelContext *context, const std::vector<int> &dims)
: OpKernel(context), dims_(dims) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
......
...@@ -22,14 +22,16 @@ ...@@ -22,14 +22,16 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace kernels { namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct UnstackFunctor { struct UnstackFunctor : OpKernel {
explicit UnstackFunctor(int axis) : axis_(axis) {} UnstackFunctor(OpKernelContext *context, int axis)
: OpKernel(context), axis_(axis) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
const std::vector<Tensor *> &outputs, const std::vector<Tensor *> &outputs,
......
...@@ -30,11 +30,13 @@ ...@@ -30,11 +30,13 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
struct WinogradTransformFunctorBase { struct WinogradTransformFunctorBase : OpKernel {
WinogradTransformFunctorBase(const Padding &padding_type, WinogradTransformFunctorBase(OpKernelContext *context,
const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int block_size) const int block_size)
: strides_({1, 1}), : OpKernel(context),
strides_({1, 1}),
dilations_({1, 1}), dilations_({1, 1}),
padding_type_(padding_type), padding_type_(padding_type),
paddings_(paddings), paddings_(paddings),
...@@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase { ...@@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase { struct WinogradTransformFunctor : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(OpKernelContext *context,
const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int block_size) const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {} : WinogradTransformFunctorBase(context,
padding_type,
paddings,
block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { ...@@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
template<typename T> template<typename T>
struct WinogradTransformFunctor<DeviceType::GPU, T> struct WinogradTransformFunctor<DeviceType::GPU, T>
: WinogradTransformFunctorBase { : WinogradTransformFunctorBase {
WinogradTransformFunctor(const Padding &padding_type, WinogradTransformFunctor(OpKernelContext *context,
const Padding &padding_type,
const std::vector<int> &paddings, const std::vector<int> &paddings,
const int block_size) const int block_size)
: WinogradTransformFunctorBase(padding_type, paddings, block_size) {} : WinogradTransformFunctorBase(context,
padding_type,
paddings,
block_size) {}
MaceStatus operator()(const Tensor *input, MaceStatus operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -85,11 +95,13 @@ struct WinogradTransformFunctor<DeviceType::GPU, T> ...@@ -85,11 +95,13 @@ struct WinogradTransformFunctor<DeviceType::GPU, T>
}; };
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
struct WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctorBase : OpKernel {
WinogradInverseTransformFunctorBase(const ActivationType activation, WinogradInverseTransformFunctorBase(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const int block_size) const int block_size)
: wino_blk_size_(block_size), : OpKernel(context),
wino_blk_size_(block_size),
activation_(activation), activation_(activation),
relux_max_limit_(relux_max_limit) {} relux_max_limit_(relux_max_limit) {}
...@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase { ...@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const ActivationType activation, WinogradInverseTransformFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const int block_size) const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
activation, relux_max_limit, block_size) {} context, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const std::vector<const Tensor*> &inputs, MaceStatus operator()(const std::vector<const Tensor*> &inputs,
Tensor *output, Tensor *output,
...@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { ...@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
template <typename T> template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::GPU, T> struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
: WinogradInverseTransformFunctorBase { : WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctor(const ActivationType activation, WinogradInverseTransformFunctor(OpKernelContext *context,
const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const int block_size) const int block_size)
: WinogradInverseTransformFunctorBase( : WinogradInverseTransformFunctorBase(
activation, relux_max_limit, block_size) {} context, activation, relux_max_limit, block_size) {}
MaceStatus operator()(const std::vector<const Tensor*> &inputs, MaceStatus operator()(const std::vector<const Tensor*> &inputs,
Tensor *output, Tensor *output,
......
...@@ -21,10 +21,12 @@ ...@@ -21,10 +21,12 @@
#include <memory> #include <memory>
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/ops/ops_register.h" #include "mace/ops/ops_register.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data, ...@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data,
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
MaceStatus CheckGPUAvalibility(const NetDef *net_def) { MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
// Check OpenCL avaliable // Check OpenCL avaliable
auto runtime = OpenCLRuntime::Global(); auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) { if (!runtime->is_opencl_avaliable()) {
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
...@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) { ...@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
} // namespace } // namespace
class GPUContextBuilder::Impl {
public:
void SetStoragePath(const std::string &path);
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
void SetOpenCLParameterPath(const std::string &path);
std::shared_ptr<GPUContext> Finalize();
public:
std::string storage_path_;
std::vector<std::string> opencl_binary_paths_;
std::string opencl_parameter_path_;
};
void GPUContextBuilder::Impl::SetStoragePath(const std::string &path) {
storage_path_ = path;
}
void GPUContextBuilder::Impl::SetOpenCLBinaryPaths(
const std::vector<std::string> &paths) {
opencl_binary_paths_ = paths;
}
void GPUContextBuilder::Impl::SetOpenCLParameterPath(
const std::string &path) {
opencl_parameter_path_ = path;
}
std::shared_ptr<GPUContext> GPUContextBuilder::Impl::Finalize() {
return std::shared_ptr<GPUContext>(new GPUContext(storage_path_,
opencl_binary_paths_,
opencl_parameter_path_));
}
GPUContextBuilder::GPUContextBuilder() : impl_(new GPUContextBuilder::Impl) {}
GPUContextBuilder::~GPUContextBuilder() = default;
GPUContextBuilder &GPUContextBuilder::SetStoragePath(const std::string &path) {
impl_->SetStoragePath(path);
return *this;
}
GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths(
const std::vector<std::string> &paths) {
impl_->SetOpenCLBinaryPaths(paths);
return *this;
}
GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath(
const std::string &path) {
impl_->SetOpenCLParameterPath(path);
return *this;
}
std::shared_ptr<GPUContext> GPUContextBuilder::Finalize() {
return impl_->Finalize();
}
class MaceEngineConfig::Impl {
public:
explicit Impl(const DeviceType device_type);
~Impl() = default;
MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
MaceStatus SetCPUThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp);
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids);
inline DeviceType device_type() const {
return device_type_;
}
inline int num_threads() const {
return num_threads_;
}
inline std::shared_ptr<GPUContext> gpu_context() const {
return gpu_context_;
}
inline GPUPriorityHint gpu_priority_hint() const {
return gpu_priority_hint_;
}
inline GPUPerfHint gpu_perf_hint() const {
return gpu_perf_hint_;
}
private:
DeviceType device_type_;
int num_threads_;
std::shared_ptr<GPUContext> gpu_context_;
GPUPriorityHint gpu_priority_hint_;
GPUPerfHint gpu_perf_hint_;
};
MaceEngineConfig::Impl::Impl(const DeviceType device_type)
: device_type_(device_type),
num_threads_(-1),
gpu_context_(new GPUContext),
gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
MaceStatus MaceEngineConfig::Impl::SetGPUContext(
std::shared_ptr<GPUContext> context) {
gpu_context_ = context;
return MACE_SUCCESS;
}
MaceStatus MaceEngineConfig::Impl::SetGPUHints(
GPUPerfHint perf_hint,
GPUPriorityHint priority_hint) {
gpu_perf_hint_ = perf_hint;
gpu_priority_hint_ = priority_hint;
return MACE_SUCCESS;
}
MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
int num_threads,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
num_threads_ = num_threads;
return mace::SetOpenMPThreadsAndAffinityPolicy(
num_threads, policy, use_gemmlowp);
}
MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids) {
num_threads_ = num_threads;
return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
}
MaceEngineConfig::MaceEngineConfig(
const DeviceType device_type)
: impl_(new MaceEngineConfig::Impl(device_type)) {}
MaceEngineConfig::~MaceEngineConfig() = default;
MaceStatus MaceEngineConfig::SetGPUContext(
std::shared_ptr<GPUContext> context) {
return impl_->SetGPUContext(context);
}
MaceStatus MaceEngineConfig::SetGPUHints(
GPUPerfHint perf_hint,
GPUPriorityHint priority_hint) {
return impl_->SetGPUHints(perf_hint, priority_hint);
}
MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
}
MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids) {
return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
}
DeviceType MaceEngineConfig::device_type() const {
return impl_->device_type();
}
int MaceEngineConfig::num_threads() const {
return impl_->num_threads();
}
std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
return impl_->gpu_context();
}
GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
return impl_->gpu_perf_hint();
}
GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
return impl_->gpu_priority_hint();
}
// Mace Tensor // Mace Tensor
class MaceTensor::Impl { class MaceTensor::Impl {
public: public:
...@@ -155,7 +350,7 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; } ...@@ -155,7 +350,7 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
// Mace Engine // Mace Engine
class MaceEngine::Impl { class MaceEngine::Impl {
public: public:
explicit Impl(DeviceType device_type); explicit Impl(const MaceEngineConfig &config);
~Impl(); ~Impl();
...@@ -178,6 +373,7 @@ class MaceEngine::Impl { ...@@ -178,6 +373,7 @@ class MaceEngine::Impl {
size_t model_data_size_; size_t model_data_size_;
std::shared_ptr<OperatorRegistryBase> op_registry_; std::shared_ptr<OperatorRegistryBase> op_registry_;
DeviceType device_type_; DeviceType device_type_;
std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_; std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_; std::map<std::string, mace::InputInfo> input_info_map_;
...@@ -189,11 +385,12 @@ class MaceEngine::Impl { ...@@ -189,11 +385,12 @@ class MaceEngine::Impl {
MACE_DISABLE_COPY_AND_ASSIGN(Impl); MACE_DISABLE_COPY_AND_ASSIGN(Impl);
}; };
MaceEngine::Impl::Impl(DeviceType device_type) MaceEngine::Impl::Impl(const MaceEngineConfig &config)
: model_data_(nullptr), : model_data_(nullptr),
model_data_size_(0), model_data_size_(0),
op_registry_(new OperatorRegistry()), op_registry_(new OperatorRegistry()),
device_type_(device_type), device_type_(config.device_type()),
device_(nullptr),
ws_(new Workspace()), ws_(new Workspace()),
net_(nullptr) net_(nullptr)
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
...@@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type) ...@@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type)
#endif #endif
{ {
LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
device_.reset(new CPUDevice(config.num_threads()));
}
#ifdef MACE_ENABLE_OPENCL
if (device_type_ == DeviceType::GPU) {
device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
config.gpu_context()->opencl_cache_storage(),
config.gpu_priority_hint(),
config.gpu_perf_hint(),
config.gpu_context()->opencl_binary_storage(),
config.num_threads()));
}
#endif
} }
MaceStatus MaceEngine::Impl::Init( MaceStatus MaceEngine::Impl::Init(
...@@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init(
// Check avalibility // Check avalibility
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (device_type_ == DeviceType::GPU) { if (device_type_ == DeviceType::GPU) {
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def)); MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
} }
#endif #endif
// Get input and output information. // Get input and output information.
...@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(input_info_map_)); << MakeString(MapKeys(input_info_map_));
} }
ws_->CreateTensor(MakeString("mace_input_node_", input_name), ws_->CreateTensor(MakeString("mace_input_node_", input_name),
GetDeviceAllocator(device_type_), DT_FLOAT); device_->allocator(), DT_FLOAT);
} }
for (auto output_name : output_nodes) { for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) { if (output_info_map_.find(output_name) == output_info_map_.end()) {
...@@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(output_info_map_)); << MakeString(MapKeys(output_info_map_));
} }
ws_->CreateTensor(MakeString("mace_output_node_", output_name), ws_->CreateTensor(MakeString("mace_output_node_", output_name),
GetDeviceAllocator(device_type_), DT_FLOAT); device_->allocator(), DT_FLOAT);
} }
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) { if (device_type_ == HEXAGON) {
...@@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init(
} }
} else { } else {
#endif #endif
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor( MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
*net_def, device_type_, model_data)); device_.get(),
model_data));
// Init model // Init model
auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_, auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(),
NetMode::INIT); NetMode::INIT);
MACE_RETURN_IF_ERROR(net->Run()); MACE_RETURN_IF_ERROR(net->Run());
net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_); net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get());
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
} }
#endif #endif
if (device_type_ == DeviceType::GPU) { if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data); ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
...@@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run(
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
if (device_type_ == GPU) { if (device_type_ == GPU) {
OpenCLRuntime::Global()->SaveBuiltCLProgram(); device_->opencl_runtime()->SaveBuiltCLProgram();
} }
#endif #endif
for (auto &output : *outputs) { for (auto &output : *outputs) {
...@@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run(
return MACE_SUCCESS; return MACE_SUCCESS;
} }
MaceEngine::MaceEngine(DeviceType device_type): MaceEngine::MaceEngine(const MaceEngineConfig &config):
impl_(new MaceEngine::Impl(device_type)) {} impl_(new MaceEngine::Impl(config)) {}
MaceEngine::~MaceEngine() = default; MaceEngine::~MaceEngine() = default;
...@@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto( ...@@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto(
const std::string &model_data_file, const std::string &model_data_file,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes, const std::vector<std::string> &output_nodes,
const DeviceType device_type, const MaceEngineConfig &config,
std::shared_ptr<MaceEngine> *engine) { std::shared_ptr<MaceEngine> *engine) {
LOG(INFO) << "Create MaceEngine from model pb"; LOG(INFO) << "Create MaceEngine from model pb";
// load model // load model
...@@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto( ...@@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto(
std::shared_ptr<NetDef> net_def(new NetDef()); std::shared_ptr<NetDef> net_def(new NetDef());
net_def->ParseFromArray(&model_pb[0], model_pb.size()); net_def->ParseFromArray(&model_pb[0], model_pb.size());
engine->reset(new mace::MaceEngine(device_type)); engine->reset(new mace::MaceEngine(config));
MaceStatus status = (*engine)->Init( MaceStatus status = (*engine)->Init(
net_def.get(), input_nodes, output_nodes, model_data_file); net_def.get(), input_nodes, output_nodes, model_data_file);
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/core/file_storage.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
class FileStorageFactory::Impl {
public:
explicit Impl(const std::string &path);
std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
private:
std::string path_;
};
FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
const std::string &name) {
return std::move(std::unique_ptr<KVStorage>(
new FileStorage(path_ + "/" + name)));
}
FileStorageFactory::FileStorageFactory(const std::string &path):
impl_(new FileStorageFactory::Impl(path)) {}
FileStorageFactory::~FileStorageFactory() = default;
std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
const std::string &name) {
return impl_->CreateStorage(name);
}
extern std::shared_ptr<KVStorageFactory> kStorageFactory;
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory) {
VLOG(1) << "Set internal KV Storage Engine";
kStorageFactory = storage_factory;
}
// Set OpenCL Compiled Binary paths, just call once. (Not thread-safe)
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths) {
#ifdef MACE_ENABLE_OPENCL
OpenCLRuntime::ConfigureOpenCLBinaryPath(paths);
#else
MACE_UNUSED(paths);
#endif // MACE_ENABLE_OPENCL
}
extern std::string kOpenCLParameterPath;
void SetOpenCLParameterPath(const std::string &path) {
#ifdef MACE_ENABLE_OPENCL
kOpenCLParameterPath = path;
#else
MACE_UNUSED(path);
#endif // MACE_ENABLE_OPENCL
}
void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) {
#ifdef MACE_ENABLE_OPENCL
VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
<< ", gpu_priority_hint: " << gpu_priority_hint;
OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
#else
MACE_UNUSED(gpu_perf_hint);
MACE_UNUSED(gpu_priority_hint);
#endif // MACE_ENABLE_OPENCL
}
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint
<< ", affinity policy: " << policy;
return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint,
policy,
use_gemmlowp);
}
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids) {
return SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
}
MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids) {
return GetCPUBigLittleCoreIDs(big_core_ids, little_core_ids);
}
}; // namespace mace
mace { mace {
global: global:
*GPUContextBuilder*;
*MaceEngineConfig*;
*MaceTensor*; *MaceTensor*;
*MaceEngine*; *MaceEngine*;
*CreateMaceEngineFromProto*; *CreateMaceEngineFromProto*;
*FileStorageFactory*;
*SetKVStorageFactory*;
*SetOpenCLBinaryPaths*;
*SetOpenCLParameterPath*;
*SetGPUHints*;
*SetOpenMPThreadPolicy*;
*SetOpenMPThreadAffinity*;
*GetBigLittleCoreIDs*; *GetBigLittleCoreIDs*;
*MaceVersion*; *MaceVersion*;
......
...@@ -23,8 +23,25 @@ cc_library( ...@@ -23,8 +23,25 @@ cc_library(
hdrs = [ hdrs = [
"ops_test_util.h", "ops_test_util.h",
], ],
srcs = [
"ops_test_util.cc",
],
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
]) + if_android_armv7([
"-mfloat-abi=softfp",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
deps = [ deps = [
"//mace/core", "//mace/ops",
"@gtest", "@gtest",
], ],
) )
...@@ -36,6 +53,7 @@ cc_library( ...@@ -36,6 +53,7 @@ cc_library(
exclude = [ exclude = [
"*_test.cc", "*_test.cc",
"*_benchmark.cc", "*_benchmark.cc",
"ops_test_util.cc",
"buffer_to_image.cc", "buffer_to_image.cc",
"image_to_buffer.cc", "image_to_buffer.cc",
"lstmcell.cc", "lstmcell.cc",
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ActivationOp : public Operator<D, T> { class ActivationOp : public Operator<D, T> {
public: public:
ActivationOp(const OperatorDef &operator_def, Workspace *ws) ActivationOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(kernels::StringToActivationType( functor_(context,
kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
static_cast<T>( static_cast<T>(
......
...@@ -58,7 +58,7 @@ void TestSimpleRelu() { ...@@ -58,7 +58,7 @@ void TestSimpleRelu() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() { ...@@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -159,7 +159,7 @@ void TestSimpleRelux() { ...@@ -159,7 +159,7 @@ void TestSimpleRelux() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -209,7 +209,7 @@ void TestSimpleReluRelux() { ...@@ -209,7 +209,7 @@ void TestSimpleReluRelux() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -267,7 +267,7 @@ void TestSimplePrelu() { ...@@ -267,7 +267,7 @@ void TestSimplePrelu() {
} }
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -318,7 +318,7 @@ void TestSimpleTanh() { ...@@ -318,7 +318,7 @@ void TestSimpleTanh() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092, {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092,
-0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758, -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758,
...@@ -371,7 +371,7 @@ void TestSimpleSigmoid() { ...@@ -371,7 +371,7 @@ void TestSimpleSigmoid() {
net.RunOp(D); net.RunOp(D);
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01, {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01,
6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01, 6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01,
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class AddNOp : public Operator<D, T> { class AddNOp : public Operator<D, T> {
public: public:
AddNOp(const OperatorDef &operator_def, Workspace *ws) AddNOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
Tensor *output_tensor = this->Output(0); Tensor *output_tensor = this->Output(0);
......
...@@ -39,7 +39,7 @@ void SimpleAdd2() { ...@@ -39,7 +39,7 @@ void SimpleAdd2() {
// Run // Run
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); auto expected = net.CreateTensor<float>({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -98,7 +98,7 @@ void SimpleAdd3() { ...@@ -98,7 +98,7 @@ void SimpleAdd3() {
} }
auto expected = auto expected =
CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4, 1e-3);
} }
...@@ -136,8 +136,8 @@ void RandomTest() { ...@@ -136,8 +136,8 @@ void RandomTest() {
// run on cpu // run on cpu
net.RunOp(); net.RunOp();
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
...@@ -160,7 +160,7 @@ void RandomTest() { ...@@ -160,7 +160,7 @@ void RandomTest() {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class ArgMaxOp : public Operator<D, T> { class ArgMaxOp : public Operator<D, T> {
public: public:
ArgMaxOp(const OperatorDef &operator_def, Workspace *ws) ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector<index_t> &input_shape, ...@@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector<index_t> &input_shape,
} }
// Check // Check
auto expected = CreateTensor<int32_t>(output_shape, output); auto expected = net.CreateTensor<int32_t>(output_shape, output);
ExpectTensorNear<int32_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<int32_t>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
......
...@@ -25,9 +25,9 @@ namespace ops { ...@@ -25,9 +25,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class BatchNormOp : public Operator<D, T> { class BatchNormOp : public Operator<D, T> {
public: public:
BatchNormOp(const OperatorDef &operator_def, Workspace *ws) BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(false, kernels::ActivationType::NOOP, 0.0f) { functor_(context, false, kernels::ActivationType::NOOP, 0.0f) {
epsilon_ = OperatorBase::GetOptionalArg<float>("epsilon", epsilon_ = OperatorBase::GetOptionalArg<float>("epsilon",
static_cast<float>(1e-4)); static_cast<float>(1e-4));
} }
...@@ -52,7 +52,8 @@ class BatchNormOp : public Operator<D, T> { ...@@ -52,7 +52,8 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT); Tensor *output = this->Output(OUTPUT);
MACE_RETURN_IF_ERROR(output->ResizeLike(input)); MACE_RETURN_IF_ERROR(output->ResizeLike(input));
return functor_(input, scale, offset, mean, var, epsilon_, output, future); return functor_(input, scale, offset,
mean, var, epsilon_, output, future);
} }
private: private:
......
...@@ -79,7 +79,7 @@ void Simple() { ...@@ -79,7 +79,7 @@ void Simple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
...@@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
...@@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2);
} }
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
...@@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
...@@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2);
} }
} // namespace test } // namespace test
......
...@@ -27,9 +27,10 @@ namespace ops { ...@@ -27,9 +27,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> { class BatchToSpaceNDOp : public Operator<D, T> {
public: public:
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("crops", {0, 0, 0, 0}), functor_(context,
OperatorBase::GetRepeatedArgs<int>("crops", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}), OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
true) {} true) {}
......
...@@ -24,10 +24,11 @@ namespace ops { ...@@ -24,10 +24,11 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class BiasAddOp : public Operator<D, T> { class BiasAddOp : public Operator<D, T> {
public: public:
BiasAddOp(const OperatorDef &operator_def, Workspace *ws) BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>( functor_(context,
"data_format", NHWC))) {} static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
"data_format", NHWC))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -66,7 +66,7 @@ void BiasAddSimple() { ...@@ -66,7 +66,7 @@ void BiasAddSimple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 6, 2, 1}, {1, 6, 2, 1},
{5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5}); {5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5});
...@@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
...@@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} // namespace test } // namespace test
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> { class BufferToImageOp : public Operator<D, T> {
public: public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws) BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT); const Tensor *input_tensor = this->Input(INPUT);
......
...@@ -25,8 +25,8 @@ namespace ops { ...@@ -25,8 +25,8 @@ namespace ops {
template <DeviceType D, typename SrcType> template <DeviceType D, typename SrcType>
class CastOp : public Operator<D, SrcType> { class CastOp : public Operator<D, SrcType> {
public: public:
CastOp(const OperatorDef &op_def, Workspace *ws) CastOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, SrcType>(op_def, ws) {} : Operator<D, SrcType>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_UNUSED(future); MACE_UNUSED(future);
......
...@@ -26,10 +26,10 @@ namespace ops { ...@@ -26,10 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ChannelShuffleOp : public Operator<D, T> { class ChannelShuffleOp : public Operator<D, T> {
public: public:
ChannelShuffleOp(const OperatorDef &operator_def, Workspace *ws) ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
group_(OperatorBase::GetOptionalArg<int>("group", 1)), group_(OperatorBase::GetOptionalArg<int>("group", 1)),
functor_(this->group_) {} functor_(context, this->group_) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { ...@@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 16}, {1, 1, 2, 16},
{0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31}); 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ConcatOp : public Operator<D, T> { class ConcatOp : public Operator<D, T> {
public: public:
ConcatOp(const OperatorDef &op_def, Workspace *ws) ConcatOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 3)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 3)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) MACE_CHECK(this->InputSize() >= 2)
......
...@@ -28,9 +28,10 @@ namespace ops { ...@@ -28,9 +28,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> { class Conv2dOp : public ConvPool2dOpBase<D, T> {
public: public:
Conv2dOp(const OperatorDef &op_def, Workspace *ws) Conv2dOp(const OperatorDef &op_def, OpKernelContext *context)
: ConvPool2dOpBase<D, T>(op_def, ws), : ConvPool2dOpBase<D, T>(op_def, context),
functor_(this->strides_.data(), functor_(context,
this->strides_.data(),
this->padding_type_, this->padding_type_,
this->paddings_, this->paddings_,
this->dilations_.data(), this->dilations_.data(),
...@@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> { ...@@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f), OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
static_cast<bool>(OperatorBase::GetOptionalArg<int>( static_cast<bool>(OperatorBase::GetOptionalArg<int>(
"is_filter_transformed", false)), "is_filter_transformed", false)),
ws->GetScratchBuffer(D)) {} context->workspace()->GetScratchBuffer(D)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() { ...@@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() { ...@@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 3, 1}, {1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
...@@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() { ...@@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() {
} }
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {18.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -298,7 +298,7 @@ void TestNHWCCombined3x3() { ...@@ -298,7 +298,7 @@ void TestNHWCCombined3x3() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() { ...@@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
template <DeviceType D, typename T> template <DeviceType D, typename T>
...@@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() { ...@@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() {
} }
// Check // Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f}); auto expected = net.CreateTensor<float>({1, 1, 1, 1}, {0.0f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
...@@ -515,7 +515,7 @@ void TestConv1x1() { ...@@ -515,7 +515,7 @@ void TestConv1x1() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 10, 2}, {1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
...@@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape, ...@@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
...@@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage", BufferToImage<D, half>(&net, "Input", "InputImage",
...@@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape, ...@@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-1);
}; };
...@@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape, ...@@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
...@@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, half>(&net, "Input", "InputImage", BufferToImage<D, half>(&net, "Input", "InputImage",
...@@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape, ...@@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-1);
}; };
...@@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, ...@@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
...@@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() { ...@@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() {
// Run // Run
net.Run(); net.Run();
// Check // Check
auto expected = CreateTensor<uint8_t>({1, 1, 1, 1}, {230}); auto expected = net.CreateTensor<uint8_t>({1, 1, 1, 1}, {230});
ExpectTensorNear<uint8_t>(*expected, *output); ExpectTensorNear<uint8_t>(*expected, *output);
} }
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ConvPool2dOpBase : public Operator<D, T> { class ConvPool2dOpBase : public Operator<D, T> {
public: public:
ConvPool2dOpBase(const OperatorDef &op_def, Workspace *ws) ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
strides_(OperatorBase::GetRepeatedArgs<int>("strides")), strides_(OperatorBase::GetRepeatedArgs<int>("strides")),
padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( padding_type_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME)))), "padding", static_cast<int>(SAME)))),
......
...@@ -21,6 +21,8 @@ namespace test { ...@@ -21,6 +21,8 @@ namespace test {
TEST(CoreTest, INIT_MODE) { TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs; std::vector<OperatorDef> op_defs;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
std::unique_ptr<Tuner<uint32_t>> tuner;
Workspace ws; Workspace ws;
op_defs.emplace_back(OperatorDef()); op_defs.emplace_back(OperatorDef());
...@@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { ...@@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) {
.AddIntArg("mode", static_cast<int>(NetMode::INIT)) .AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]); .Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), Tensor *input = ws.CreateTensor("Input", device->allocator(),
DataTypeToEnum<float>::v()); DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3}); input->Resize({1, 3, 3, 3});
{ {
...@@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) { ...@@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) {
} }
std::shared_ptr<OperatorRegistryBase> op_registry(new OperatorRegistry()); std::shared_ptr<OperatorRegistryBase> op_registry(new OperatorRegistry());
auto net = auto net =
CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT); CreateNet(op_registry, net_def, &ws, device, NetMode::INIT);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU); net = CreateNet(op_registry, net_def, &ws, device);
net->Run(); net->Run();
EXPECT_TRUE(ws.GetTensor("Output") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class CropOp : public Operator<D, T> { class CropOp : public Operator<D, T> {
public: public:
CropOp(const OperatorDef &op_def, Workspace *ws) CropOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 2), functor_(context,
OperatorBase::GetOptionalArg<int>("axis", 2),
OperatorBase::GetRepeatedArgs<int>("offset")) {} OperatorBase::GetRepeatedArgs<int>("offset")) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -75,7 +75,7 @@ void RunCrop(const std::vector<index_t> &input_shape, ...@@ -75,7 +75,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
} }
// Check // Check
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output")); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
} }
} // namespace } // namespace
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class Deconv2dOp : public Operator<D, T> { class Deconv2dOp : public Operator<D, T> {
public: public:
Deconv2dOp(const OperatorDef &op_def, Workspace *ws) Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("strides"), functor_(context,
OperatorBase::GetRepeatedArgs<int>("strides"),
static_cast<Padding>(OperatorBase::GetOptionalArg<int>( static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(SAME))), "padding", static_cast<int>(SAME))),
OperatorBase::GetRepeatedArgs<int>("padding_values"), OperatorBase::GetRepeatedArgs<int>("padding_values"),
......
...@@ -79,7 +79,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -79,7 +79,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.0001);
} }
...@@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch,
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
...@@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch, ...@@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch,
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-4, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
......
...@@ -27,10 +27,10 @@ namespace ops { ...@@ -27,10 +27,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class DepthToSpaceOp : public Operator<D, T> { class DepthToSpaceOp : public Operator<D, T> {
public: public:
DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
block_size_(OperatorBase::GetOptionalArg<int>("block_size", 1)), block_size_(OperatorBase::GetOptionalArg<int>("block_size", 1)),
functor_(this->block_size_, true) {} functor_(context, this->block_size_, true) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s, ...@@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
......
...@@ -29,9 +29,10 @@ namespace ops { ...@@ -29,9 +29,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> { class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
public: public:
DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) DepthwiseConv2dOp(const OperatorDef &op_def, OpKernelContext *context)
: ConvPool2dOpBase<D, T>(op_def, ws), : ConvPool2dOpBase<D, T>(op_def, context),
functor_(this->strides_.data(), functor_(context,
this->strides_.data(),
this->padding_type_, this->padding_type_,
this->paddings_, this->paddings_,
this->dilations_.data(), this->dilations_.data(),
......
...@@ -80,7 +80,7 @@ void SimpleValidTest() { ...@@ -80,7 +80,7 @@ void SimpleValidTest() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 2, 2, 2}, {1, 2, 2, 2},
{37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f}); {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
...@@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch, ...@@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch,
} }
auto expected = auto expected =
CreateTensor<T>({1, out_height, out_width, out_channels}, expect); net.CreateTensor<T>({1, out_height, out_width, out_channels}, expect);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) {
"Output", NHWC); "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) { ...@@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) {
// Check // Check
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5, ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-4); 1e-4);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-2,
1e-2); 1e-2);
} }
}; };
...@@ -387,7 +387,7 @@ void QuantSimpleValidTest() { ...@@ -387,7 +387,7 @@ void QuantSimpleValidTest() {
net.Run(); net.Run();
// Check // Check
auto expected = CreateTensor<uint8_t>({1, 1, 1, 2}, {255, 21}); auto expected = net.CreateTensor<uint8_t>({1, 1, 1, 2}, {255, 21});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output")); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"));
} }
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class EltwiseOp : public Operator<D, T> { class EltwiseOp : public Operator<D, T> {
public: public:
EltwiseOp(const OperatorDef &op_def, Workspace *ws) EltwiseOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_( functor_(
context,
static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>( static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))), "type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"), OperatorBase::GetRepeatedArgs<float>("coeff"),
......
...@@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type, ...@@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type,
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<DstType>({}, {output}); auto expected = net.CreateTensor<DstType>({}, {output});
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type, ...@@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<DstType>(shape, output); auto expected = net.CreateTensor<DstType>(shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, ...@@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
if (input0.size() < input1.size()) { if (input0.size() < input1.size()) {
output_shape = shape1; output_shape = shape1;
} }
auto expected = CreateTensor<DstType>(output_shape, output); auto expected = net.CreateTensor<DstType>(output_shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type, ...@@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type,
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
auto expected = CreateTensor<DstType>(output_shape, output); auto expected = net.CreateTensor<DstType>(output_shape, output);
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
...@@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type,
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type, ...@@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
} }
} }
...@@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net.RunOp(DeviceType::CPU); net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0", BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImg0",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, ...@@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2);
} }
} }
} // namespace } // namespace
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FillOp : public Operator<D, T> { class FillOp : public Operator<D, T> {
public: public:
FillOp(const OperatorDef &operator_def, Workspace *ws) FillOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_() {} functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *shape = this->Input(SHAPE); const Tensor *shape = this->Input(SHAPE);
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FoldedBatchNormOp : public Operator<D, T> { class FoldedBatchNormOp : public Operator<D, T> {
public: public:
FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(true, functor_(context,
true,
kernels::StringToActivationType( kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
......
...@@ -83,7 +83,7 @@ void Simple() { ...@@ -83,7 +83,7 @@ void Simple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708,
3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); 3.1708, 5.5125, 5.5125, 7.8543, 7.8543});
...@@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ...@@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
...@@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-2);
} }
TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
...@@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
...@@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ...@@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4);
} }
TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
...@@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
NHWC); NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
...@@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-2, 1e-2);
} }
} // namespace test } // namespace test
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> { class FullyConnectedOp : public Operator<D, T> {
public: public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(kernels::StringToActivationType( functor_(context, kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {} OperatorBase::GetOptionalArg<float>("max_limit", 0.0f)) {}
...@@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator<D, T> { ...@@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator<D, T> {
" don't match."); " don't match.");
} }
return functor_(input, weight, bias, output, future); return functor_(input, weight,
bias, output, future);
} }
private: private:
......
...@@ -76,7 +76,7 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -76,7 +76,7 @@ void Simple(const std::vector<index_t> &input_shape,
} }
// Check // Check
auto expected = CreateTensor<float>(output_shape, output_value); auto expected = net.CreateTensor<float>(output_shape, output_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -156,8 +156,8 @@ void Random(const index_t batch, ...@@ -156,8 +156,8 @@ void Random(const index_t batch,
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
// Check // Check
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
...@@ -181,10 +181,10 @@ void Random(const index_t batch, ...@@ -181,10 +181,10 @@ void Random(const index_t batch,
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-1,
1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-3); 1e-3);
} }
} }
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class GatherOp : public Operator<D, T> { class GatherOp : public Operator<D, T> {
public: public:
GatherOp(const OperatorDef &operator_def, Workspace *ws) GatherOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 0), functor_(context,
OperatorBase::GetOptionalArg<int>("axis", 0),
OperatorBase::GetOptionalArg<float>("y", 1.0)) {} OperatorBase::GetOptionalArg<float>("y", 1.0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -47,7 +47,7 @@ void TestGather(const std::vector<index_t> &weight_shape, ...@@ -47,7 +47,7 @@ void TestGather(const std::vector<index_t> &weight_shape,
// Run // Run
net.RunOp(CPU); net.RunOp(CPU);
auto expected = CreateTensor<float>(output_shape, output); auto expected = net.CreateTensor<float>(output_shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
......
...@@ -25,8 +25,8 @@ namespace ops { ...@@ -25,8 +25,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class IdentityOp : public Operator<D, T> { class IdentityOp : public Operator<D, T> {
public: public:
IdentityOp(const OperatorDef &op_def, Workspace *ws) IdentityOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> { class ImageToBufferOp : public Operator<D, T> {
public: public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {} functor_(context,
OperatorBase::GetOptionalArg<int>("wino_block_size", 2)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class InferConv2dShapeOp : public Operator<D, T> { class InferConv2dShapeOp : public Operator<D, T> {
public: public:
InferConv2dShapeOp(const OperatorDef &op_def, Workspace *ws) InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,8 +24,8 @@ namespace ops { ...@@ -24,8 +24,8 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class LocalResponseNormOp : public Operator<D, T> { class LocalResponseNormOp : public Operator<D, T> {
public: public:
LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), functor_() { : Operator<D, T>(operator_def, context), functor_(context) {
depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5); depth_radius_ = OperatorBase::GetOptionalArg<int>("depth_radius", 5);
bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f); bias_ = OperatorBase::GetOptionalArg<float>("bias", 1.0f);
alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f); alpha_ = OperatorBase::GetOptionalArg<float>("alpha", 1.0f);
......
...@@ -46,7 +46,7 @@ void Simple() { ...@@ -46,7 +46,7 @@ void Simple() {
} }
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 6}, {1, 1, 2, 6},
{0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47});
......
...@@ -26,10 +26,12 @@ namespace ops { ...@@ -26,10 +26,12 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class LSTMCellOp : public Operator<D, T> { class LSTMCellOp : public Operator<D, T> {
public: public:
LSTMCellOp(const OperatorDef &op_def, Workspace *ws) LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(static_cast<T>( functor_(context,
OperatorBase::GetOptionalArg<float>("scalar_input", 0.0))) {} static_cast<T>(
OperatorBase::GetOptionalArg<float>("scalar_input",
0.0))) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,8 +24,9 @@ namespace ops { ...@@ -24,8 +24,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> { class MatMulOp : public Operator<D, T> {
public: public:
MatMulOp(const OperatorDef &operator_def, Workspace *ws) MatMulOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(context),
transpose_a_(OperatorBase::GetOptionalArg<bool>("transpose_a", false)), transpose_a_(OperatorBase::GetOptionalArg<bool>("transpose_a", false)),
transpose_b_(OperatorBase::GetOptionalArg<bool>("transpose_b", false)) { transpose_b_(OperatorBase::GetOptionalArg<bool>("transpose_b", false)) {
} }
...@@ -46,7 +47,8 @@ class MatMulOp : public Operator<D, T> { ...@@ -46,7 +47,8 @@ class MatMulOp : public Operator<D, T> {
MACE_CHECK(ak == bk, "the number of A's column ", ak, MACE_CHECK(ak == bk, "the number of A's column ", ak,
" must be equal to B's row ", bk); " must be equal to B's row ", bk);
return functor_(A, B, C, transpose_a_, transpose_b_, future); return functor_(A, B, C,
transpose_a_, transpose_b_, future);
} }
private: private:
......
...@@ -65,7 +65,7 @@ void Simple(const std::vector<index_t> &A_shape, ...@@ -65,7 +65,7 @@ void Simple(const std::vector<index_t> &A_shape,
} }
// Check // Check
auto expected = CreateTensor<float>(C_shape, C_value); auto expected = net.CreateTensor<float>(C_shape, C_value);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -171,15 +171,15 @@ void Complex(const std::vector<index_t> &batch, ...@@ -171,15 +171,15 @@ void Complex(const std::vector<index_t> &batch,
// Check // Check
EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape());
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
expected.Reshape({batch_count, height, out_width}); expected->Reshape({batch_count, height, out_width});
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-1); 1e-1);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5,
1e-5); 1e-5);
} }
} }
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Returns the process-wide OpTestContext singleton.
// The instance is created lazily on first use; C++11 guarantees
// thread-safe initialization of function-local statics, and the
// instance lives until program exit.
OpTestContext *OpTestContext::Get() {
  static OpTestContext instance;
  return &instance;
}
// Accessor for the shared GPUContext (holds the OpenCL tuner and cache
// storage) used when constructing GPU devices for tests. Returns a copy
// of the shared_ptr, so callers share ownership with the context.
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
  return gpu_context_;
}
// Returns the cached Device for |device_type|; ownership remains with
// the context (do not delete the returned pointer).
// NOTE(review): operator[] default-inserts a null unique_ptr for a type
// not populated in the constructor (only CPU and GPU are), so an unknown
// device type silently yields nullptr — callers must pass CPU or GPU.
Device *OpTestContext::GetDevice(DeviceType device_type) {
  return device_map_[device_type].get();
}
// Builds the default devices used by the ops tests: one CPU device and
// one GPU device that shares this context's OpenCL tuner and cache
// storage, at normal GPU priority.
// NOTE(review): the -1 passed to CPUDevice presumably selects a default
// thread count — confirm against the CPUDevice constructor.
OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) {
  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(new CPUDevice(-1));
  device_map_[DeviceType::GPU] = std::unique_ptr<Device>(
      new GPUDevice(gpu_context_->opencl_tuner(),
                    gpu_context_->opencl_cache_storage(),
                    GPUPriorityHint::PRIORITY_NORMAL));
}
} // namespace test
} // namespace ops
} // namespace mace
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <functional> #include <functional>
#include <limits> #include <limits>
#include <map>
#include <memory> #include <memory>
#include <random> #include <random>
#include <string> #include <string>
...@@ -26,7 +27,8 @@ ...@@ -26,7 +27,8 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/device_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/kernels/opencl/common.h" #include "mace/kernels/opencl/common.h"
...@@ -110,9 +112,28 @@ class OpDefBuilder { ...@@ -110,9 +112,28 @@ class OpDefBuilder {
OperatorDef op_def_; OperatorDef op_def_;
}; };
// Process-wide singleton owning the test Devices (CPU/GPU) and the
// shared GPUContext used by the ops unit tests. Replaces the former
// global static device state. Non-copyable.
class OpTestContext {
 public:
  // Lazily-created singleton accessor.
  static OpTestContext *Get();
  // Shared GPU context (OpenCL tuner / binary cache storage).
  std::shared_ptr<GPUContext> gpu_context() const;
  // Cached device for |device_type|; the context retains ownership.
  Device *GetDevice(DeviceType device_type);

 private:
  OpTestContext();
  MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);

  std::shared_ptr<GPUContext> gpu_context_;
  std::map<DeviceType, std::unique_ptr<Device>> device_map_;
};
class OpsTestNet { class OpsTestNet {
public: public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {} OpsTestNet() :
op_registry_(new OperatorRegistry()) {
}
// Waits for any outstanding GPU work (via Sync) before the net and its
// workspace are destroyed, so OpenCL resources are not torn down while
// the command queue is still executing.
~OpsTestNet() {
  Sync();
}
template <DeviceType D, typename T> template <DeviceType D, typename T>
void AddInputFromArray(const std::string &name, void AddInputFromArray(const std::string &name,
...@@ -121,7 +142,8 @@ class OpsTestNet { ...@@ -121,7 +142,8 @@ class OpsTestNet {
const float scale = 0.0, const float scale = 0.0,
const int32_t zero_point = 0) { const int32_t zero_point = 0) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -136,7 +158,8 @@ class OpsTestNet { ...@@ -136,7 +158,8 @@ class OpsTestNet {
const std::vector<index_t> &shape, const std::vector<index_t> &shape,
const T data) { const T data) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -149,7 +172,8 @@ class OpsTestNet { ...@@ -149,7 +172,8 @@ class OpsTestNet {
bool positive = true, bool positive = true,
bool truncate = false) { bool truncate = false) {
Tensor *input = Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v()); ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
input->Resize(shape); input->Resize(shape);
Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>(); T *input_data = input->mutable_data<T>();
...@@ -184,8 +208,10 @@ class OpsTestNet { ...@@ -184,8 +208,10 @@ class OpsTestNet {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void Transpose2D(const std::string &src_name, const std::string &dst_name) { void Transpose2D(const std::string &src_name, const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); dst_name,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 2, "input shape != 2"); MACE_CHECK(input_shape.size() == 2, "input shape != 2");
output->Resize({input_shape[1], input_shape[0]}); output->Resize({input_shape[1], input_shape[0]});
...@@ -205,8 +231,10 @@ class OpsTestNet { ...@@ -205,8 +231,10 @@ class OpsTestNet {
void CopyData(const std::string &src_name, void CopyData(const std::string &src_name,
const std::string &dst_name) { const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); dst_name,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
output->Resize(input_shape); output->Resize(input_shape);
...@@ -222,8 +250,10 @@ class OpsTestNet { ...@@ -222,8 +250,10 @@ class OpsTestNet {
const std::string &dst_name, const std::string &dst_name,
const DataFormat dst_format) { const DataFormat dst_format) {
Tensor *input = ws_.GetTensor(src_name); Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); dst_name,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4"); MACE_CHECK(input_shape.size() == 4, "input shape != 4");
...@@ -352,8 +382,10 @@ class OpsTestNet { ...@@ -352,8 +382,10 @@ class OpsTestNet {
void FillNHWCInputToNCHWInput(const std::string &name_nchw, void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) { const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc); Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D), Tensor *output = ws_.CreateTensor(
DataTypeToEnum<T>::v()); name_nchw,
OpTestContext::Get()->GetDevice(D)->allocator(),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape(); const std::vector<index_t> input_shape = input->shape();
index_t batch = input_shape[0]; index_t batch = input_shape[0];
index_t height = input_shape[1]; index_t height = input_shape[1];
...@@ -374,6 +406,22 @@ class OpsTestNet { ...@@ -374,6 +406,22 @@ class OpsTestNet {
} }
} }
// Create standalone tensor on device D with T type.
// The tensor is allocated from device D's allocator. When |data| is
// non-empty, the tensor is resized to |shape| and the values are copied
// in; when |data| is empty the tensor is returned unshaped/unallocated
// (useful as a destination for Tensor::Copy).
// NOTE(review): data.size() is assumed to match the element count
// implied by |shape| — the memcpy below does not validate this.
template <typename T, DeviceType D = DeviceType::CPU>
std::unique_ptr<Tensor> CreateTensor(
    const std::vector<index_t> &shape = {},
    const std::vector<T> &data = {}) {
  std::unique_ptr<Tensor> res(
      new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(),
                 DataTypeToEnum<T>::v()));
  if (!data.empty()) {
    res->Resize(shape);
    T *input_data = res->mutable_data<T>();
    memcpy(input_data, data.data(), data.size() * sizeof(T));
  }
  return res;
}
OperatorDef *NewOperatorDef() { OperatorDef *NewOperatorDef() {
op_defs_.clear(); op_defs_.clear();
op_defs_.emplace_back(OperatorDef()); op_defs_.emplace_back(OperatorDef());
...@@ -392,8 +440,9 @@ class OpsTestNet { ...@@ -392,8 +440,9 @@ class OpsTestNet {
for (auto &op_def_ : op_defs_) { for (auto &op_def_ : op_defs_) {
net_def.add_op()->CopyFrom(op_def_); net_def.add_op()->CopyFrom(op_def_);
} }
net_ = CreateNet(op_registry_, net_def, &ws_, device); net_ = CreateNet(op_registry_, net_def, &ws_,
device_ = device; OpTestContext::Get()->GetDevice(device));
device_type_ = device;
return net_ != nullptr; return net_ != nullptr;
} }
...@@ -416,10 +465,15 @@ class OpsTestNet { ...@@ -416,10 +465,15 @@ class OpsTestNet {
MaceStatus RunOp() { return RunOp(DeviceType::CPU); } MaceStatus RunOp() { return RunOp(DeviceType::CPU); }
MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { MaceStatus RunNet(const NetDef &net_def, const DeviceType device) {
device_ = device; device_type_ = device;
net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT); net_ = CreateNet(op_registry_,
net_def,
&ws_,
OpTestContext::Get()->GetDevice(device),
NetMode::INIT);
MACE_RETURN_IF_ERROR(net_->Run()); MACE_RETURN_IF_ERROR(net_->Run());
net_ = CreateNet(op_registry_, net_def, &ws_, device); net_ = CreateNet(op_registry_, net_def, &ws_,
OpTestContext::Get()->GetDevice(device));
return net_->Run(); return net_->Run();
} }
...@@ -432,9 +486,12 @@ class OpsTestNet { ...@@ -432,9 +486,12 @@ class OpsTestNet {
} }
void Sync() { void Sync() {
if (net_ && device_ == DeviceType::GPU) { #ifdef MACE_ENABLE_OPENCL
OpenCLRuntime::Global()->command_queue().finish(); if (net_ && device_type_ == DeviceType::GPU) {
OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime()
->command_queue().finish();
} }
#endif
} }
public: public:
...@@ -442,17 +499,17 @@ class OpsTestNet { ...@@ -442,17 +499,17 @@ class OpsTestNet {
Workspace ws_; Workspace ws_;
std::vector<OperatorDef> op_defs_; std::vector<OperatorDef> op_defs_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
DeviceType device_; DeviceType device_type_;
}; };
class OpsTestBase : public ::testing::Test { class OpsTestBase : public ::testing::Test {
protected: protected:
virtual void SetUp() { virtual void SetUp() {
// OpenCLRuntime::CreateGlobal(); SetOpenMPThreadsAndAffinityPolicy(-1,
CPUAffinityPolicy::AFFINITY_BIG_ONLY);
} }
virtual void TearDown() { virtual void TearDown() {
// OpenCLRuntime::DestroyGlobal();
} }
}; };
...@@ -510,17 +567,6 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) { ...@@ -510,17 +567,6 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
return std::move(dest); return std::move(dest);
} }
template <typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) {
std::unique_ptr<Tensor> res(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape);
T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T));
return res;
}
inline bool IsSameSize(const Tensor &x, const Tensor &y) { inline bool IsSameSize(const Tensor &x, const Tensor &y) {
if (x.dim_size() != y.dim_size()) return false; if (x.dim_size() != y.dim_size()) return false;
for (int d = 0; d < x.dim_size(); ++d) { for (int d = 0; d < x.dim_size(); ++d) {
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class PadOp : public Operator<D, T> { class PadOp : public Operator<D, T> {
public: public:
PadOp(const OperatorDef &operator_def, Workspace *ws) PadOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings"), functor_(context,
OperatorBase::GetRepeatedArgs<int>("paddings"),
OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {} OperatorBase::GetOptionalArg<float>("constant_value", 0.0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -63,7 +63,7 @@ void Simple() { ...@@ -63,7 +63,7 @@ void Simple() {
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 5, 6, 1}, { {1, 5, 6, 1}, {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2,
1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0,
...@@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) { ...@@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) {
auto output = net.GetTensor("Output"); auto output = net.GetTensor("Output");
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 3, 3, 4}, {1, 3, 3, 4},
{ {
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
...@@ -134,8 +134,8 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -134,8 +134,8 @@ void Complex(const std::vector<index_t> &input_shape,
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage", BufferToImage<DeviceType::GPU, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -155,9 +155,9 @@ void Complex(const std::vector<index_t> &input_shape, ...@@ -155,9 +155,9 @@ void Complex(const std::vector<index_t> &input_shape,
auto output = net.GetTensor("OpenCLOutput"); auto output = net.GetTensor("OpenCLOutput");
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *output, 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *output, 1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *output, 1e-5); ExpectTensorNear<float>(*expected, *output, 1e-5);
} }
} }
} // namespace } // namespace
......
...@@ -27,13 +27,14 @@ namespace ops { ...@@ -27,13 +27,14 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class PoolingOp : public ConvPool2dOpBase<D, T> { class PoolingOp : public ConvPool2dOpBase<D, T> {
public: public:
PoolingOp(const OperatorDef &op_def, Workspace *ws) PoolingOp(const OperatorDef &op_def, OpKernelContext *context)
: ConvPool2dOpBase<D, T>(op_def, ws), : ConvPool2dOpBase<D, T>(op_def, context),
kernels_(OperatorBase::GetRepeatedArgs<int>("kernels")), kernels_(OperatorBase::GetRepeatedArgs<int>("kernels")),
pooling_type_( pooling_type_(
static_cast<PoolingType>(OperatorBase::GetOptionalArg<int>( static_cast<PoolingType>(OperatorBase::GetOptionalArg<int>(
"pooling_type", static_cast<int>(AVG)))), "pooling_type", static_cast<int>(AVG)))),
functor_(pooling_type_, functor_(context,
pooling_type_,
kernels_.data(), kernels_.data(),
this->strides_.data(), this->strides_.data(),
this->padding_type_, this->padding_type_,
......
...@@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) { ...@@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Check // Check
auto expected = auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); net.CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) { ...@@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8}); auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { ...@@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15}); auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ...@@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17}); auto expected = net.CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() { ...@@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() {
} }
// Check // Check
auto expected = CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26}); auto expected = net.CreateTensor<float>({1, 1, 4, 1}, {20, 22, 24, 26});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape, ...@@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-4); 1e-4);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} }
} // namespace } // namespace
...@@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) { ...@@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
...@@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() { ...@@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); auto expected = net.CreateTensor<float>({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<D, T>(&net, "Input", "InputImage", BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector<index_t> &shape, ...@@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) { if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-3, ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-3,
1e-3); 1e-3);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} }
} // namespace } // namespace
...@@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) { ...@@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) {
// Check // Check
auto expected = auto expected =
CreateTensor<uint8_t>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); net.CreateTensor<uint8_t>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) { ...@@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) {
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<uint8_t>({1, 2, 2, 1}, {4, 5, 7, 8}); auto expected = net.CreateTensor<uint8_t>({1, 2, 2, 1}, {4, 5, 7, 8});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) { ...@@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) {
net.RunOp(); net.RunOp();
// Check // Check
auto expected = CreateTensor<uint8_t>( auto expected = net.CreateTensor<uint8_t>(
{1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29}); {1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29});
ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<uint8_t>(*expected, *net.GetOutput("Output"), 1e-5);
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ProposalOp : public Operator<D, T> { class ProposalOp : public Operator<D, T> {
public: public:
ProposalOp(const OperatorDef &operator_def, Workspace *ws) ProposalOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("min_size", 16), functor_(context,
OperatorBase::GetOptionalArg<int>("min_size", 16),
OperatorBase::GetOptionalArg<float>("nms_thresh", 0.7), OperatorBase::GetOptionalArg<float>("nms_thresh", 0.7),
OperatorBase::GetOptionalArg<int>("pre_nms_top_n", 6000), OperatorBase::GetOptionalArg<int>("pre_nms_top_n", 6000),
OperatorBase::GetOptionalArg<int>("post_nms_top_n", 300), OperatorBase::GetOptionalArg<int>("post_nms_top_n", 300),
......
...@@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) { ...@@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) {
// Run // Run
net.RunOp(); net.RunOp();
auto expected_tensor = CreateTensor<float>({1, 1, 1, 5}, {0, 0, 0, 255, 255}); auto expected_tensor = net.CreateTensor<float>({1, 1, 1, 5},
{0, 0, 0, 255, 255});
ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5); ExpectTensorNear<float>(*expected_tensor, *net.GetTensor("Output"), 1e-5);
} }
......
...@@ -24,8 +24,9 @@ namespace ops { ...@@ -24,8 +24,9 @@ namespace ops {
template<DeviceType D, class T> template<DeviceType D, class T>
class QuantizeOp : public Operator<D, T> { class QuantizeOp : public Operator<D, T> {
public: public:
QuantizeOp(const OperatorDef &operator_def, Workspace *ws) QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(context),
non_zero_( non_zero_(
static_cast<bool>(OperatorBase::GetOptionalArg<int>("non_zero", static_cast<bool>(OperatorBase::GetOptionalArg<int>("non_zero",
0))) {} 0))) {}
...@@ -50,8 +51,8 @@ class QuantizeOp : public Operator<D, T> { ...@@ -50,8 +51,8 @@ class QuantizeOp : public Operator<D, T> {
template<DeviceType D, class T> template<DeviceType D, class T>
class DequantizeOp : public Operator<D, T> { class DequantizeOp : public Operator<D, T> {
public: public:
DequantizeOp(const OperatorDef &operator_def, Workspace *ws) DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -27,9 +27,10 @@ namespace ops { ...@@ -27,9 +27,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ReduceMeanOp : public Operator<D, T> { class ReduceMeanOp : public Operator<D, T> {
public: public:
ReduceMeanOp(const OperatorDef &operator_def, Workspace *ws) ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("axis"), functor_(context,
OperatorBase::GetRepeatedArgs<int>("axis"),
OperatorBase::GetOptionalArg<bool>("keepdims", false)) {} OperatorBase::GetOptionalArg<bool>("keepdims", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -57,7 +57,7 @@ void Simple(const std::vector<index_t> &input_shape, ...@@ -57,7 +57,7 @@ void Simple(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImg", "Output", ImageToBuffer<D, float>(&net, "OutputImg", "Output",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
auto expected = CreateTensor<float>(output_shape, output); auto expected = net.CreateTensor<float>(output_shape, output);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-3);
} }
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ReshapeOp : public Operator<D, T> { class ReshapeOp : public Operator<D, T> {
public: public:
ReshapeOp(const OperatorDef &op_def, Workspace *ws) ReshapeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context), functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ResizeBicubicOp : public Operator<D, T> { class ResizeBicubicOp : public Operator<D, T> {
public: public:
ResizeBicubicOp(const OperatorDef &operator_def, Workspace *ws) ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), functor_(context,
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { ...@@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
} }
...@@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { ...@@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 2, 3, 3}, auto expected = net.CreateTensor<float>({1, 2, 3, 3},
{0., 1., 2., 4.110297, 5.110297, 6.110297, {0., 1., 2., 4.110297, 5.110297, 6.110297,
8.223037, 9.223036, 10.223037, 24., 25., 26., 8.223037, 9.223036, 10.223037, 24., 25., 26.,
28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038});
...@@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { ...@@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
} }
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class ResizeBilinearOp : public Operator<D, T> { class ResizeBilinearOp : public Operator<D, T> {
public: public:
ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}), functor_(context,
OperatorBase::GetRepeatedArgs<index_t>("size", {-1, -1}),
OperatorBase::GetOptionalArg<bool>("align_corners", false)) {} OperatorBase::GetOptionalArg<bool>("align_corners", false)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { ...@@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { ...@@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
NHWC); NHWC);
// Check // Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -116,8 +116,8 @@ void TestRandomResizeBilinear() { ...@@ -116,8 +116,8 @@ void TestRandomResizeBilinear() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
...@@ -136,7 +136,7 @@ void TestRandomResizeBilinear() { ...@@ -136,7 +136,7 @@ void TestRandomResizeBilinear() {
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
} }
// Check // Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 1e-5, ExpectTensorNear<float>(*expected, *net.GetOutput("DeviceOutput"), 1e-5,
1e-6); 1e-6);
} }
} }
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ScalarMathOp : public Operator<D, T> { class ScalarMathOp : public Operator<D, T> {
public: public:
ScalarMathOp(const OperatorDef &op_def, Workspace *ws) ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(static_cast<kernels::EltwiseType>( functor_(context,
static_cast<kernels::EltwiseType>(
OperatorBase::GetOptionalArg<int>( OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))), "type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"), OperatorBase::GetRepeatedArgs<float>("coeff"),
......
...@@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type, ...@@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type,
net.RunOp(D); net.RunOp(D);
auto expected = CreateTensor<DstType>({}, {output}); auto expected = net.CreateTensor<DstType>({}, {output});
ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
} }
} // namespace } // namespace
TEST_F(ScalarMathOpTest, SimpleCPU) { TEST_F(ScalarMathOpTest, SimpleCPU) {
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUM, 1, 2, 3, 3); kernels::EltwiseType::SUM, 1, 2, 3, 3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::SUB, 1, 2, 3, -1); kernels::EltwiseType::SUB, 1, 2, 3, -1);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::PROD, 3, -2, 3, -6); kernels::EltwiseType::PROD, 3, -2, 3, -6);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::DIV, 3, -2, 1, -1.5); kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::MIN, 3, -2, 1, -2); kernels::EltwiseType::MIN, 3, -2, 1, -2);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::MAX, 3, -2, 1, 3); kernels::EltwiseType::MAX, 3, -2, 1, 3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::NEG, 3, -2, 1, -3); kernels::EltwiseType::NEG, 3, -2, 1, -3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::ABS, 3, -2, 1, 3); kernels::EltwiseType::ABS, 3, -2, 1, 3);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
ScalarMathTest<DeviceType::CPU, float, float>( ScalarMathTest<DeviceType::CPU, float, float>(
kernels::EltwiseType::POW, 3, 1, 1, 3); kernels::EltwiseType::POW, 3, 1, 1, 3);
ScalarMathTest<DeviceType::CPU, float, int32_t>( ScalarMathTest<DeviceType::CPU, float, int32_t>(
kernels::EltwiseType::EQUAL, 3, 3, 1, 1); kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
} }
TEST_F(ScalarMathOpTest, SimpleGPU) { TEST_F(ScalarMathOpTest, SimpleGPU) {
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUM, 1, 2, 1, 3); kernels::EltwiseType::SUM, 1, 2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::SUB, 1, 2, 1, -1); kernels::EltwiseType::SUB, 1, 2, 1, -1);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::PROD, 3, -2, 1, -6); kernels::EltwiseType::PROD, 3, -2, 1, -6);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::DIV, 3, -2, 1, -1.5); kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::MIN, 3, -2, 1, -2); kernels::EltwiseType::MIN, 3, -2, 1, -2);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::MAX, 3, -2, 1, 3); kernels::EltwiseType::MAX, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::NEG, 3, -2, 1, -3); kernels::EltwiseType::NEG, 3, -2, 1, -3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::ABS, 3, -2, 1, 3); kernels::EltwiseType::ABS, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
ScalarMathTest<DeviceType::GPU, float, float>( ScalarMathTest<DeviceType::GPU, float, float>(
kernels::EltwiseType::POW, 3, 1, 1, 3); kernels::EltwiseType::POW, 3, 1, 1, 3);
ScalarMathTest<DeviceType::GPU, float, int32_t>( ScalarMathTest<DeviceType::GPU, float, int32_t>(
kernels::EltwiseType::EQUAL, 3, 3, 1, 1); kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
} }
} // namespace test } // namespace test
} // namespace ops } // namespace ops
......
...@@ -25,8 +25,8 @@ namespace ops { ...@@ -25,8 +25,8 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class ShapeOp : public Operator<D, T> { class ShapeOp : public Operator<D, T> {
public: public:
ShapeOp(const OperatorDef &op_def, Workspace *ws) ShapeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws) {} : Operator<D, T>(op_def, context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -24,8 +24,9 @@ namespace ops { ...@@ -24,8 +24,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class SoftmaxOp : public Operator<D, T> { class SoftmaxOp : public Operator<D, T> {
public: public:
SoftmaxOp(const OperatorDef &operator_def, Workspace *ws) SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws) {} : Operator<D, T>(operator_def, context),
functor_(context) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *logits = this->Input(LOGITS); const Tensor *logits = this->Input(LOGITS);
......
...@@ -29,7 +29,7 @@ void Simple() { ...@@ -29,7 +29,7 @@ void Simple() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4}, net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4}); {1, 1, 1, 1, 1, 2, 3, 4});
auto expected = CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 4}, {1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426}); {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
...@@ -113,8 +113,8 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -113,8 +113,8 @@ void Complex(const std::vector<index_t> &logits_shape) {
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
} }
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
BufferToImage<D, float>(&net, "Input", "InputImage", BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
...@@ -131,7 +131,7 @@ void Complex(const std::vector<index_t> &logits_shape) { ...@@ -131,7 +131,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} // namespace } // namespace
......
...@@ -27,9 +27,10 @@ namespace ops { ...@@ -27,9 +27,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class SpaceToBatchNDOp : public Operator<D, T> { class SpaceToBatchNDOp : public Operator<D, T> {
public: public:
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}), functor_(context,
OperatorBase::GetRepeatedArgs<int>("paddings", {0, 0, 0, 0}),
OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}), OperatorBase::GetRepeatedArgs<int>("block_shape", {1, 1}),
false) {} false) {}
......
...@@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape, ...@@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
const std::vector<int> &padding_data, const std::vector<int> &padding_data,
const std::vector<index_t> &batch_shape, const std::vector<index_t> &batch_shape,
const std::vector<float> &batch_data) { const std::vector<float> &batch_data) {
auto space_tensor = std::unique_ptr<Tensor>( OpsTestNet net;
new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v())); auto space_tensor = net.CreateTensor<T, GPU>();
space_tensor->Resize(space_shape); space_tensor->Resize(space_shape);
{ {
Tensor::MappingGuard space_mapper(space_tensor.get()); Tensor::MappingGuard space_mapper(space_tensor.get());
T *space_ptr = space_tensor->mutable_data<T>(); T *space_ptr = space_tensor->template mutable_data<T>();
MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space_data.size()) MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space_data.size())
<< "Space tensor size:" << space_tensor->size() << "Space tensor size:" << space_tensor->size()
<< ", space data size:" << space_data.size(); << ", space data size:" << space_data.size();
memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T));
} }
auto batch_tensor = std::unique_ptr<Tensor>( auto batch_tensor = net.CreateTensor<T, GPU>();
new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum<T>::v()));
batch_tensor->Resize(batch_shape); batch_tensor->Resize(batch_shape);
{ {
Tensor::MappingGuard batch_mapper(batch_tensor.get()); Tensor::MappingGuard batch_mapper(batch_tensor.get());
T *batch_ptr = batch_tensor->mutable_data<T>(); T *batch_ptr = batch_tensor->template mutable_data<T>();
MACE_CHECK(static_cast<size_t>(batch_tensor->size()) == batch_data.size()); MACE_CHECK(static_cast<size_t>(batch_tensor->size()) == batch_data.size());
memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T));
} }
......
...@@ -27,9 +27,11 @@ namespace ops { ...@@ -27,9 +27,11 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class SpaceToDepthOp : public Operator<D, T> { class SpaceToDepthOp : public Operator<D, T> {
public: public:
SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws) SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("block_size", 1), false) {} functor_(context,
OperatorBase::GetOptionalArg<int>("block_size", 1),
false) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class SplitOp : public Operator<D, T> { class SplitOp : public Operator<D, T> {
public: public:
SplitOp(const OperatorDef &op_def, Workspace *ws) SplitOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 3)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 3)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
MACE_CHECK(this->OutputSize() >= 2) MACE_CHECK(this->OutputSize() >= 2)
......
...@@ -26,8 +26,8 @@ namespace ops { ...@@ -26,8 +26,8 @@ namespace ops {
template<DeviceType D, typename T> template<DeviceType D, typename T>
class SqueezeOp : public Operator<D, T> { class SqueezeOp : public Operator<D, T> {
public: public:
SqueezeOp(const OperatorDef &op_def, Workspace *ws) SqueezeOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
axis_(OperatorBase::GetRepeatedArgs<int>("axis", {})) {} axis_(OperatorBase::GetRepeatedArgs<int>("axis", {})) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class StackOp : public Operator<D, T> { class StackOp : public Operator<D, T> {
public: public:
StackOp(const OperatorDef &operator_def, Workspace *ws) StackOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 0)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const std::vector<const Tensor *> &inputs = this->Inputs(); const std::vector<const Tensor *> &inputs = this->Inputs();
......
...@@ -24,9 +24,10 @@ namespace ops { ...@@ -24,9 +24,10 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class StridedSliceOp : public Operator<D, T> { class StridedSliceOp : public Operator<D, T> {
public: public:
StridedSliceOp(const OperatorDef &operator_def, Workspace *ws) StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("begin_mask", 0), functor_(context,
OperatorBase::GetOptionalArg<int>("begin_mask", 0),
OperatorBase::GetOptionalArg<int>("end_mask", 0), OperatorBase::GetOptionalArg<int>("end_mask", 0),
OperatorBase::GetOptionalArg<int>("ellipsis_mask", 0), OperatorBase::GetOptionalArg<int>("ellipsis_mask", 0),
OperatorBase::GetOptionalArg<int>("new_axis_mask", 0), OperatorBase::GetOptionalArg<int>("new_axis_mask", 0),
......
...@@ -26,10 +26,10 @@ namespace mace { ...@@ -26,10 +26,10 @@ namespace mace {
template <DeviceType D, class T> template <DeviceType D, class T>
class TransposeOp : public Operator<D, T> { class TransposeOp : public Operator<D, T> {
public: public:
TransposeOp(const OperatorDef &operator_def, Workspace *ws) TransposeOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
dims_(OperatorBase::GetRepeatedArgs<int>("dims")), dims_(OperatorBase::GetRepeatedArgs<int>("dims")),
functor_(dims_) {} functor_(context, dims_) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -26,9 +26,9 @@ namespace ops { ...@@ -26,9 +26,9 @@ namespace ops {
template <DeviceType D, class T> template <DeviceType D, class T>
class UnstackOp : public Operator<D, T> { class UnstackOp : public Operator<D, T> {
public: public:
UnstackOp(const OperatorDef &operator_def, Workspace *ws) UnstackOp(const OperatorDef &operator_def, OpKernelContext *context)
: Operator<D, T>(operator_def, ws), : Operator<D, T>(operator_def, context),
functor_(OperatorBase::GetOptionalArg<int>("axis", 0)) {} functor_(context, OperatorBase::GetOptionalArg<int>("axis", 0)) {}
MaceStatus Run(StatsFuture *future) override { MaceStatus Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch, ...@@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch,
// Transfer output // Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput", ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected;
expected.Copy(*net.GetOutput("ConvOutput")); auto expected = net.CreateTensor<float>();
auto output_shape = expected.shape(); expected->Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected->shape();
// Winograd convolution // Winograd convolution
// transform filter // transform filter
...@@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch, ...@@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput", ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-5, 1e-4);
} }
} }
} // namespace } // namespace
...@@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch,
// Transfer output // Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput", ImageToBuffer<D, float>(&net, "OutputImage", "ConvOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
Tensor expected; auto expected = net.CreateTensor<float>();
expected.Copy(*net.GetOutput("ConvOutput")); expected->Copy(*net.GetOutput("ConvOutput"));
auto output_shape = expected.shape(); auto output_shape = expected->shape();
// Winograd convolution // Winograd convolution
// transform filter // transform filter
...@@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch, ...@@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch,
ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput", ImageToBuffer<D, float>(&net, "WinoOutputImage", "WinoOutput",
kernels::BufferType::IN_OUT_CHANNEL); kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DataType::DT_HALF) { if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-2, 1e-2);
} else { } else {
ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); ExpectTensorNear<float>(*expected, *net.GetOutput("WinoOutput"),
1e-5, 1e-4);
} }
} }
} // namespace } // namespace
......
...@@ -29,9 +29,11 @@ namespace ops { ...@@ -29,9 +29,11 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class WinogradInverseTransformOp : public Operator<D, T> { class WinogradInverseTransformOp : public Operator<D, T> {
public: public:
WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws) WinogradInverseTransformOp(const OperatorDef &op_def,
: Operator<D, T>(op_def, ws), OpKernelContext *context)
functor_(kernels::StringToActivationType( : Operator<D, T>(op_def, context),
functor_(context,
kernels::StringToActivationType(
OperatorBase::GetOptionalArg<std::string>("activation", OperatorBase::GetOptionalArg<std::string>("activation",
"NOOP")), "NOOP")),
OperatorBase::GetOptionalArg<float>("max_limit", 0.0f), OperatorBase::GetOptionalArg<float>("max_limit", 0.0f),
......
...@@ -26,9 +26,10 @@ namespace ops { ...@@ -26,9 +26,10 @@ namespace ops {
template <DeviceType D, typename T> template <DeviceType D, typename T>
class WinogradTransformOp : public Operator<D, T> { class WinogradTransformOp : public Operator<D, T> {
public: public:
WinogradTransformOp(const OperatorDef &op_def, Workspace *ws) WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context)
: Operator<D, T>(op_def, ws), : Operator<D, T>(op_def, context),
functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>( functor_(context,
static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
"padding", static_cast<int>(VALID))), "padding", static_cast<int>(VALID))),
OperatorBase::GetRepeatedArgs<int>("padding_values"), OperatorBase::GetRepeatedArgs<int>("padding_values"),
OperatorBase::GetOptionalArg<int>( OperatorBase::GetOptionalArg<int>(
......
...@@ -11,7 +11,6 @@ cc_library( ...@@ -11,7 +11,6 @@ cc_library(
name = "public", name = "public",
hdrs = [ hdrs = [
"mace.h", "mace.h",
"mace_runtime.h",
], ],
copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
) )
...@@ -24,12 +24,36 @@ ...@@ -24,12 +24,36 @@
#include <string> #include <string>
#include <vector> #include <vector>
#ifndef MACE_API
#define MACE_API __attribute__((visibility("default")))
#endif
namespace mace { namespace mace {
class NetDef; class NetDef;
enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
// Performance hint for the GPU (currently only honored on Adreno GPUs,
// see MaceEngineConfig::SetGPUHints).
enum GPUPerfHint {
  PERF_DEFAULT = 0,
  PERF_LOW = 1,
  PERF_NORMAL = 2,
  PERF_HIGH = 3
};

// Priority hint for the GPU (currently only honored on Adreno GPUs,
// see MaceEngineConfig::SetGPUHints).
enum GPUPriorityHint {
  PRIORITY_DEFAULT = 0,
  PRIORITY_LOW = 1,
  PRIORITY_NORMAL = 2,
  PRIORITY_HIGH = 3
};

// CPU core affinity policy for worker threads
// (see MaceEngineConfig::SetCPUThreadPolicy).
enum CPUAffinityPolicy {
  AFFINITY_NONE = 0,
  AFFINITY_BIG_ONLY = 1,
  AFFINITY_LITTLE_ONLY = 2,
};
struct CallStats { struct CallStats {
int64_t start_micros; int64_t start_micros;
int64_t end_micros; int64_t end_micros;
...@@ -73,14 +97,167 @@ enum MaceStatus { ...@@ -73,14 +97,167 @@ enum MaceStatus {
} \ } \
} }
/// \brief Get ARM big.LITTLE configuration.
///
/// This function will detect the max frequencies of all CPU cores, and assume
/// the cores with largest max frequencies as big cores, and all the remaining
/// cores as little. If all CPU cores' max frequencies are equal, big_core_ids and
/// little_core_ids will both be filled with all cpu core ids.
///
/// \param [out] big_core_ids
/// \param [out] little_core_ids
/// \return If successful, it returns MACE_SUCCESS and error if it can't
/// reliably detect the frequency of big-LITTLE cores (e.g. MTK).
MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
/// \brief GPU context contain the status used for GPU device.
///
/// The life cycle of GPUContext object is the same as MaceEngines use it.
/// Just use one GPUContext for all MaceEngines, which will speed up the
/// initialization procedure. There are some data in common between different
/// MaceEngines using GPU, use one GPUContext could avoid duplication.
class GPUContext;
/// \brief GPUContext builder.
///
/// Use the GPUContextBuilder to generate GPUContext.
/// Not thread-safe
class MACE_API GPUContextBuilder {
public:
GPUContextBuilder();
~GPUContextBuilder();
GPUContextBuilder(const GPUContextBuilder &) = delete;
GPUContextBuilder(const GPUContextBuilder &&) = delete;
GPUContextBuilder &operator=(const GPUContextBuilder &) = delete;
GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete;
/// \brief Set internal storage factory to store internal data.
///
/// Now the path is used to store the built OpenCL binaries to file,
/// which could speed up the GPU initialization and first run.
/// If do not call this API, the initialization maybe slow for GPU.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
GPUContextBuilder &SetStoragePath(const std::string &path);
/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length)
///
/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length)
/// OpenCL binary is corresponding to the OpenCL Driver version,
/// you should update the binary when OpenCL Driver changed.
///
/// \param paths MACE will use first file found in all paths
/// \return
GPUContextBuilder &SetOpenCLBinaryPaths(
const std::vector<std::string> &paths);
/// \brief Set the path of Generated OpenCL parameter file
///
/// If you use gpu for specific soc, The parameters is the local work group
/// size tuned for specific SOC, which may be faster than the
/// general parameters.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
GPUContextBuilder &SetOpenCLParameterPath(const std::string &path);
std::shared_ptr<GPUContext> Finalize();
private:
class Impl;
std::unique_ptr<Impl> impl_;
};
/// \brief Configuration of a MaceEngine: device, threading and GPU options.
///
/// Not copyable or movable; pass it by const reference to MaceEngine.
class MACE_API MaceEngineConfig {
 public:
  /// \param device_type one of [CPU, GPU, HEXAGON]
  explicit MaceEngineConfig(const DeviceType device_type);
  ~MaceEngineConfig();
  // Non-copyable and non-movable: the config owns a pimpl instance.
  MaceEngineConfig(const MaceEngineConfig &) = delete;
  MaceEngineConfig(const MaceEngineConfig &&) = delete;
  MaceEngineConfig &operator=(const MaceEngineConfig &) = delete;
  MaceEngineConfig &operator=(const MaceEngineConfig &&) = delete;

  /// \brief Set GPUContext
  ///
  /// Just use one GPUContext for multiple models run on GPU.
  /// \param context created using GPUContextBuilder
  /// \return MACE_SUCCESS for success, other for failure.
  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);

  /// \brief Set GPU hints, currently only supports Adreno GPU.
  ///
  /// Caution: this function may hurt performance
  /// if improper parameters are provided.
  ///
  /// \param perf_hint performance hint
  /// \param priority_hint priority hint
  /// \return MACE_SUCCESS for success, other for failure.
  MaceStatus SetGPUHints(GPUPerfHint perf_hint,
                         GPUPriorityHint priority_hint);

  /// \brief Set CPU threads number and affinity policy.
  ///
  /// Caution: this function may hurt performance if improper
  /// parameters are provided. When num_threads_hint is zero or negative,
  /// the function will set the threads number equal to the number of
  /// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
  /// (AFFINITY_NONE) cores according to the policy. The threads number will
  /// also be truncated to the corresponding cores number when
  /// num_threads_hint is larger than it.
  /// The OpenMP threads will be bound to (via sched_setaffinity) big cores
  /// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
  ///
  /// \param num_threads_hint it is only a hint.
  /// \param policy one of CPUAffinityPolicy
  /// \param use_gemmlowp use gemmlowp for quantized inference
  /// \return MACE_SUCCESS for success, or an error if it can't reliably
  ///         detect big-LITTLE cores (see GetBigLittleCoreIDs). In such
  ///         cases, it's suggested to use AFFINITY_NONE to use all cores.
  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
                                CPUAffinityPolicy policy,
                                bool use_gemmlowp = false);

  /// \brief Set OpenMP threads number and processor affinity.
  ///
  /// Caution: this function may hurt performance
  /// if improper parameters are provided.
  /// This function may not work well on some chips (e.g. MTK). Setting thread
  /// affinity to offline cores may run very slowly or unexpectedly.
  /// In such cases, please use SetCPUThreadPolicy with the default policy
  /// instead.
  ///
  /// \param num_threads number of OpenMP threads
  /// \param cpu_ids ids of the CPU cores the threads are bound to
  /// \return MACE_SUCCESS for success, other for failure.
  MaceStatus SetOpenMPThreadAffinity(
      int num_threads,
      const std::vector<int> &cpu_ids);

  // Accessors used when the engine reads the configuration back.
  DeviceType device_type() const;

  int num_threads() const;

  std::shared_ptr<GPUContext> gpu_context() const;

  GPUPriorityHint gpu_priority_hint() const;

  GPUPerfHint gpu_perf_hint() const;

 private:
  class Impl;  // pimpl: hides implementation details from the public header
  std::unique_ptr<Impl> impl_;
};
// MACE input/output tensor // MACE input/output tensor
class __attribute__((visibility("default"))) MaceTensor { class MACE_API MaceTensor {
public: public:
// shape - the shape of the tensor, with size n // shape - the shape of the tensor, with size n
// data - the buffer of the tensor, must not be null with size equals // data - the buffer of the tensor, must not be null with size equals
// shape[0] * shape[1] * ... * shape[n-1] // shape[0] * shape[1] * ... * shape[n-1]
explicit MaceTensor(const std::vector<int64_t> &shape, MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data); std::shared_ptr<float> data);
MaceTensor(); MaceTensor();
MaceTensor(const MaceTensor &other); MaceTensor(const MaceTensor &other);
MaceTensor(const MaceTensor &&other); MaceTensor(const MaceTensor &&other);
...@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor { ...@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor {
std::unique_ptr<Impl> impl_; std::unique_ptr<Impl> impl_;
}; };
class __attribute__((visibility("default"))) MaceEngine { class MACE_API MaceEngine {
public: public:
explicit MaceEngine(DeviceType device_type); explicit MaceEngine(const MaceEngineConfig &config);
~MaceEngine(); ~MaceEngine();
MaceStatus Init(const NetDef *net_def, MaceStatus Init(const NetDef *net_def,
...@@ -135,18 +312,16 @@ class __attribute__((visibility("default"))) MaceEngine { ...@@ -135,18 +312,16 @@ class __attribute__((visibility("default"))) MaceEngine {
/// \param model_data_file[in]: the path of model data file /// \param model_data_file[in]: the path of model data file
/// \param input_nodes[in]: the array of input nodes' name /// \param input_nodes[in]: the array of input nodes' name
/// \param output_nodes[in]: the array of output nodes' name /// \param output_nodes[in]: the array of output nodes' name
/// \param device_type[in]: one of [CPU, GPU, HEXAGON], /// \param config[in]: configurations for MaceEngine.
/// based on the runtime type of your model deployment file.
/// \param engine[out]: output MaceEngine object /// \param engine[out]: output MaceEngine object
/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
/// MACE_OUT_OF_RESOURCES for resources is out of range. /// MACE_OUT_OF_RESOURCES for resources is out of range.
__attribute__((visibility("default"))) MACE_API MaceStatus CreateMaceEngineFromProto(
MaceStatus CreateMaceEngineFromProto(
const std::vector<unsigned char> &model_pb, const std::vector<unsigned char> &model_pb,
const std::string &model_data_file, const std::string &model_data_file,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes, const std::vector<std::string> &output_nodes,
const DeviceType device_type, const MaceEngineConfig &config,
std::shared_ptr<MaceEngine> *engine); std::shared_ptr<MaceEngine> *engine);
} // namespace mace } // namespace mace
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file defines runtime tuning APIs.
// These APIs are not stable.
#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
#define MACE_PUBLIC_MACE_RUNTIME_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"
namespace mace {
enum GPUPerfHint {
PERF_DEFAULT = 0,
PERF_LOW = 1,
PERF_NORMAL = 2,
PERF_HIGH = 3
};
enum GPUPriorityHint {
PRIORITY_DEFAULT = 0,
PRIORITY_LOW = 1,
PRIORITY_NORMAL = 2,
PRIORITY_HIGH = 3
};
enum CPUAffinityPolicy {
AFFINITY_NONE = 0,
AFFINITY_BIG_ONLY = 1,
AFFINITY_LITTLE_ONLY = 2,
};
/// \brief Abstract key-value storage used to persist internal data
/// (e.g. built OpenCL binaries, see SetKVStorageFactory) between runs.
class KVStorage {
 public:
  /// Load the storage contents.
  // return: 0 for success, -1 for error
  virtual int Load() = 0;
  /// Clear the storage contents.
  virtual void Clear() = 0;
  /// Insert a key/value pair.
  // NOTE(review): the meaning of the bool return value is not specified
  // here -- confirm against concrete implementations.
  virtual bool Insert(const std::string &key,
                      const std::vector<unsigned char> &value) = 0;
  /// Look up a value by key; presumably returns nullptr when the key is
  /// absent -- confirm against concrete implementations.
  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
  /// Write the in-memory contents back to the backing store.
  // return: 0 for success, -1 for error
  virtual int Flush() = 0;
  virtual ~KVStorage() {}
};
/// \brief Factory interface that creates named KVStorage instances.
class KVStorageFactory {
 public:
  // Virtual destructor: factories are held and destroyed polymorphically
  // (e.g. via std::shared_ptr<KVStorageFactory> in SetKVStorageFactory),
  // so deletion through a base pointer must reach the derived destructor.
  virtual ~KVStorageFactory() {}

  /// \brief Create a storage object identified by name.
  /// \param name identifier of the storage (e.g. a file name)
  /// \return the created storage, owned by the caller
  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
};
/// \brief KVStorageFactory that persists each storage as a file under a
/// caller-provided directory path.
class __attribute__((visibility("default"))) FileStorageFactory
    : public KVStorageFactory {
 public:
  /// \param path base directory for the storage files.
  // You have to make sure your APP has read and write permission of the path.
  explicit FileStorageFactory(const std::string &path);
  ~FileStorageFactory();

  /// Create a file-backed storage named `name` under the configured path.
  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;

 private:
  class Impl;  // pimpl: hides file-handling details
  std::unique_ptr<Impl> impl_;
};
/// \brief Set internal storage factory to store internal data. (Call once)
///
/// Now the path is used to store the built OpenCL binaries to file,
/// which could speed up the GPU initialization and first run.
/// If you do not call this API, the initialization may be slow for GPU.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
__attribute__((visibility("default")))
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length)
///
/// Just call once. (Not thread-safe)
/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length)
/// OpenCL binary is corresponding to the OpenCL Driver version,
/// you should update the binary when OpenCL Driver changed.
///
/// \param paths MACE will use first file found in all paths
/// \return
__attribute__((visibility("default")))
void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
/// \brief Set the path of Generated OpenCL parameter file
///
/// Just call once. (Not thread-safe)
/// If you use gpu for specific soc, The parameters is the local work group
/// size tuned for specific SOC, which may be faster than the
/// general parameters.
///
/// \param path Make sure your program have Read/Write permission of the path
/// \return
__attribute__((visibility("default")))
void SetOpenCLParameterPath(const std::string &path);
/// \brief Set GPU hints, currently only supports Adreno GPU.
///
/// Caution: this function may hurt performance
/// if improper parameters provided.
///
/// \param perf_hint performance hint
/// \param priority_hint priority hint
/// \return
__attribute__((visibility("default")))
void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
/// \brief Set OpenMP threads number and affinity policy.
///
/// Caution: this function may hurt performance if improper parameters provided.
/// When num_threads_hint is zero or negative,
/// the function will set the threads number equaling to the number of
/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
/// (AFFINITY_NONE) cores according to the policy. The threads number will
/// also be truncated to the corresponding cores number when num_threads_hint
/// is larger than it.
/// The OpenMP threads will be bound to (via sched_setaffinity) big cores
/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for
/// quantized inference.
///
/// \param num_threads_hint it is only a hint.
/// \param policy one of CPUAffinityPolicy
/// \param use_gemmlowp use gemmlowp for quantized inference
/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
/// AFFINITY_NONE to use all cores.
__attribute__((visibility("default")))
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp = false);
/// \brief Set OpenMP threads number and processor affinity.
///
/// Caution: this function may hurt performance
/// if improper parameters provided.
/// This function may not work well on some chips (e.g. MTK). Setting thread
/// affinity to offline cores may run very slow or unexpectedly.
/// In such cases, please use SetOpenMPThreadPolicy with default policy
/// instead.
///
/// \param num_threads
/// \param cpu_ids
/// \return
__attribute__((visibility("default")))
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids);
/// \brief Get ARM big.LITTLE configuration.
///
/// This function will detect the max frequencies of all CPU cores, and assume
/// the cores with largest max frequencies as big cores, and all the remaining
/// cores as little. If all CPU cores' max frequencies are equal, big_core_ids and
/// little_core_ids will both be filled with all cpu core ids.
///
/// \param [out] big_core_ids
/// \param [out] little_core_ids
/// \return If successful, it returns MACE_SUCCESS and error if it can't
/// reliably detect the frequency of big-LITTLE cores (e.g. MTK).
__attribute__((visibility("default")))
MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
} // namespace mace
#endif // MACE_PUBLIC_MACE_RUNTIME_H_
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
#include <vector> #include <vector>
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
...@@ -57,8 +56,7 @@ std::map<std::string, int> model_name_map { ...@@ -57,8 +56,7 @@ std::map<std::string, int> model_name_map {
/// if model_data_format is code, just pass empty string("") /// if model_data_format is code, just pass empty string("")
/// \param input_nodes[in]: the array of input nodes' name /// \param input_nodes[in]: the array of input nodes' name
/// \param output_nodes[in]: the array of output nodes' name /// \param output_nodes[in]: the array of output nodes' name
/// \param device_type[in]: one of [CPU, GPU, HEXAGON], /// \param config[in]: configurations for MaceEngine.
/// based on the runtime type of your model deployment file.
/// \param engine[out]: output MaceEngine object /// \param engine[out]: output MaceEngine object
/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
/// MACE_OUT_OF_RESOURCES for resources is out of range. /// MACE_OUT_OF_RESOURCES for resources is out of range.
...@@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode( ...@@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode(
const std::string &model_data_file, const std::string &model_data_file,
const std::vector<std::string> &input_nodes, const std::vector<std::string> &input_nodes,
const std::vector<std::string> &output_nodes, const std::vector<std::string> &output_nodes,
const DeviceType device_type, const MaceEngineConfig &config,
std::shared_ptr<MaceEngine> *engine) { std::shared_ptr<MaceEngine> *engine) {
// load model // load model
if (engine == nullptr) { if (engine == nullptr) {
...@@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode( ...@@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode(
{% for i in range(model_tags |length) %} {% for i in range(model_tags |length) %}
case {{ i }}: case {{ i }}:
net_def = mace::{{model_tags[i]}}::CreateNet(); net_def = mace::{{model_tags[i]}}::CreateNet();
engine->reset(new mace::MaceEngine(device_type)); engine->reset(new mace::MaceEngine(config));
{% if embed_model_data %} {% if embed_model_data %}
model_data = mace::{{model_tags[i]}}::LoadModelData(); model_data = mace::{{model_tags[i]}}::LoadModelData();
status = (*engine)->Init(net_def.get(), input_nodes, output_nodes, status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
......
# Description:
# Mace operators.
#
package( package(
default_visibility = ["//visibility:public"], default_visibility = ["//visibility:public"],
) )
......
...@@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { ...@@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
input_names.push_back(MakeString("input", 0)); input_names.push_back(MakeString("input", 0));
output_names.push_back(MakeString("output", 0)); output_names.push_back(MakeString("output", 0));
const DeviceType device = DeviceType::GPU; MaceEngineConfig config(DeviceType::GPU);
config.SetGPUContext(
ops::test::OpTestContext::Get()->gpu_context());
std::shared_ptr<NetDef> net_def(new NetDef()); std::shared_ptr<NetDef> net_def(new NetDef());
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
...@@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { ...@@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
info->set_name(input_names[i]); info->set_name(input_names[i]);
} }
MaceEngine engine(device); MaceEngine engine(config);
ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr), ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr),
""); "");
} }
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/public/mace_runtime.h"
namespace mace { namespace mace {
namespace test { namespace test {
...@@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def, ...@@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def,
for (auto output : outputs) { for (auto output : outputs) {
std::unique_ptr<Tensor> tmp_tensor( std::unique_ptr<Tensor> tmp_tensor(
new Tensor(GetDeviceAllocator(DeviceType::CPU), new Tensor(GetCPUAllocator(),
DataTypeToEnum<float>::v())); DataTypeToEnum<float>::v()));
auto output_shape = output.second.shape(); auto output_shape = output.second.shape();
const int64_t data_size = std::accumulate(output_shape.begin(), const int64_t data_size = std::accumulate(output_shape.begin(),
...@@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) { ...@@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) {
OutputInfo *info = net_def->add_output_info(); OutputInfo *info = net_def->add_output_info();
info->set_name(output_names[i]); info->set_name(output_names[i]);
} }
MaceEngineConfig config(DeviceType::GPU);
const std::string file_path ="/data/local/tmp/mace"; MaceEngine engine(config);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(file_path));
mace::SetKVStorageFactory(storage_factory);
MaceEngine engine(device);
MaceStatus status = engine.Init(net_def.get(), input_names, output_names, MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
reinterpret_cast<unsigned char *>(data.data())); reinterpret_cast<unsigned char *>(data.data()));
EXPECT_EQ(status, MaceStatus::MACE_SUCCESS); EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
...@@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) { ...@@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) {
const int thread_num = 10; const int thread_num = 10;
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int i = 0; i < thread_num; ++i) { for (int i = 0; i < thread_num; ++i) {
threads.push_back(std::thread(MaceRunFunc, i)); threads.push_back(std::thread(MaceRunFunc, 1));
} }
for (auto &t : threads) { for (auto &t : threads) {
t.join(); t.join();
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/public/mace_runtime.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace test { namespace test {
...@@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def, ...@@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def,
} }
net.RunNet(net_def, D); net.RunNet(net_def, D);
std::unique_ptr<Allocator> allocator(new CPUAllocator);
for (auto output : outputs) { for (auto output : outputs) {
std::unique_ptr<Tensor> tmp_tensor( std::unique_ptr<Tensor> tmp_tensor(
new Tensor(GetDeviceAllocator(DeviceType::CPU), new Tensor(allocator.get(),
DataTypeToEnum<float>::v())); DataTypeToEnum<float>::v()));
auto output_shape = output.second.shape(); auto output_shape = output.second.shape();
const int64_t data_size = std::accumulate(output_shape.begin(), const int64_t data_size = std::accumulate(output_shape.begin(),
...@@ -333,7 +334,9 @@ void MaceRun(const int in_out_size, ...@@ -333,7 +334,9 @@ void MaceRun(const int in_out_size,
info->set_name(output_names[i]); info->set_name(output_names[i]);
} }
MaceEngine engine(device); MaceEngineConfig config(DeviceType::GPU);
MaceEngine engine(config);
MaceStatus status = engine.Init(net_def.get(), input_names, output_names, MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
reinterpret_cast<unsigned char *>(data.data())); reinterpret_cast<unsigned char *>(data.data()));
EXPECT_EQ(status, MaceStatus::MACE_SUCCESS); EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
......
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h" #include "mace/utils/env_time.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name, ...@@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name,
const std::vector<std::vector<int64_t>> &input_shapes, const std::vector<std::vector<int64_t>> &input_shapes,
const std::vector<std::string> &output_names, const std::vector<std::string> &output_names,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes) {
MACE_RETURN_IF_ERROR(mace::SetOpenMPThreadPolicy( // config runtime
FLAGS_omp_num_threads, CPUAffinityPolicy::AFFINITY_NONE)); MaceStatus status;
MaceEngineConfig config(DeviceType::CPU);
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads,
CPUAffinityPolicy::AFFINITY_NONE);
if (status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
}
std::vector<unsigned char> model_pb_data; std::vector<unsigned char> model_pb_data;
if (FLAGS_model_file != "") { if (FLAGS_model_file != "") {
...@@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name, ...@@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
DeviceType::CPU, config,
&engine)); &engine));
#else #else
(void) (model_name); (void) (model_name);
...@@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name, ...@@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
DeviceType::CPU, config,
&engine)); &engine));
#endif #endif
......
...@@ -33,7 +33,6 @@ ...@@ -33,7 +33,6 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h" #include "mace/utils/env_time.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name, ...@@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name,
const std::vector<std::vector<int64_t>> &output_shapes) { const std::vector<std::vector<int64_t>> &output_shapes) {
DeviceType device_type = ParseDeviceType(FLAGS_device); DeviceType device_type = ParseDeviceType(FLAGS_device);
// config runtime // config runtime
MaceStatus status = mace::SetOpenMPThreadPolicy( MaceStatus status;
FLAGS_omp_num_threads, MaceEngineConfig config(device_type);
static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy), status = config.SetCPUThreadPolicy(
true); FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy),
true);
if (status != MACE_SUCCESS) { if (status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) { if (device_type == DeviceType::GPU) {
mace::SetGPUHints( const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint), const std::string storage_path =
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint)); std::string(storage_path_ptr == nullptr ?
"/data/local/tmp/mace_run/interior" : storage_path_ptr);
std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file}; std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
mace::SetOpenCLBinaryPaths(opencl_binary_paths);
mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); gpu_context = GPUContextBuilder()
.SetStoragePath(storage_path)
.SetOpenCLBinaryPaths(opencl_binary_paths)
.SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
.Finalize();
config.SetGPUContext(gpu_context);
config.SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string kernel_file_path =
std::string(kernel_path == nullptr ?
"/data/local/tmp/mace_run/interior" : kernel_path);
std::shared_ptr<KVStorageFactory> storage_factory(
new FileStorageFactory(kernel_file_path));
SetKVStorageFactory(storage_factory);
std::vector<unsigned char> model_pb_data; std::vector<unsigned char> model_pb_data;
if (FLAGS_model_file != "") { if (FLAGS_model_file != "") {
if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) { if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) {
...@@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name, ...@@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
(void)(model_name); (void)(model_name);
...@@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name, ...@@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
int64_t t1 = NowMicros(); int64_t t1 = NowMicros();
...@@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name, ...@@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status =
...@@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name, ...@@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
} while (create_engine_status != MACE_SUCCESS); } while (create_engine_status != MACE_SUCCESS);
...@@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name, ...@@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#else #else
create_engine_status = create_engine_status =
...@@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name, ...@@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name,
FLAGS_model_data_file, FLAGS_model_data_file,
input_names, input_names,
output_names, output_names,
device_type, config,
&engine); &engine);
#endif #endif
} while (create_engine_status != MACE_SUCCESS); } while (create_engine_status != MACE_SUCCESS);
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
#ifndef MACE_UTILS_TUNER_H_ #ifndef MACE_UTILS_TUNER_H_
#define MACE_UTILS_TUNER_H_ #define MACE_UTILS_TUNER_H_
#include <stdlib.h> #include <stdlib.h>
#include <cstring>
#include <fstream> #include <fstream>
#include <functional> #include <functional>
#include <limits> #include <limits>
...@@ -29,18 +31,24 @@ ...@@ -29,18 +31,24 @@
namespace mace { namespace mace {
inline bool IsTuning() {
const char *tuning = getenv("MACE_TUNING");
return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
}
template <typename param_type> template <typename param_type>
class Tuner { class Tuner {
public: public:
static Tuner *Get() { explicit Tuner(const std::string tuned_param_file_path = ""):
static Tuner tuner; tuned_param_file_path_(tuned_param_file_path) {
return &tuner; path_ = getenv("MACE_RUN_PARAMETER_PATH");
ReadRunParamters();
} }
inline bool IsTuning() { ~Tuner() { WriteRunParameters(); }
const char *tuning = getenv("MACE_TUNING");
return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1'; Tuner(const Tuner &) = delete;
} Tuner &operator=(const Tuner &) = delete;
template <typename RetType> template <typename RetType>
RetType TuneOrRun( RetType TuneOrRun(
...@@ -76,16 +84,6 @@ class Tuner { ...@@ -76,16 +84,6 @@ class Tuner {
} }
private: private:
Tuner() {
path_ = getenv("MACE_RUN_PARAMETER_PATH");
ReadRunParamters();
}
~Tuner() { WriteRunParameters(); }
Tuner(const Tuner &) = delete;
Tuner &operator=(const Tuner &) = delete;
inline void WriteRunParameters() { inline void WriteRunParameters() {
if (path_ != nullptr) { if (path_ != nullptr) {
VLOG(3) << "Write tuning result to " << path_; VLOG(3) << "Write tuning result to " << path_;
...@@ -117,9 +115,9 @@ class Tuner { ...@@ -117,9 +115,9 @@ class Tuner {
} }
inline void ReadRunParamters() { inline void ReadRunParamters() {
extern std::string kOpenCLParameterPath; if (!tuned_param_file_path_.empty()) {
if (!kOpenCLParameterPath.empty()) { std::ifstream ifs(tuned_param_file_path_,
std::ifstream ifs(kOpenCLParameterPath, std::ios::binary | std::ios::in); std::ios::binary | std::ios::in);
if (ifs.is_open()) { if (ifs.is_open()) {
int64_t num_params = 0; int64_t num_params = 0;
ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params)); ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params));
...@@ -144,7 +142,7 @@ class Tuner { ...@@ -144,7 +142,7 @@ class Tuner {
LOG(WARNING) << "Read OpenCL tuned parameters file failed."; LOG(WARNING) << "Read OpenCL tuned parameters file failed.";
} }
} else { } else {
LOG(INFO) << "There is no tuned parameters."; VLOG(1) << "There is no tuned parameters.";
} }
} }
...@@ -207,6 +205,7 @@ class Tuner { ...@@ -207,6 +205,7 @@ class Tuner {
} }
private: private:
std::string tuned_param_file_path_;
const char *path_; const char *path_;
std::unordered_map<std::string, std::vector<param_type>> param_table_; std::unordered_map<std::string, std::vector<param_type>> param_table_;
}; };
......
...@@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) { ...@@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) {
} }
}; };
Tuner<unsigned int> tuner;
WallClockTimer timer; WallClockTimer timer;
std::vector<unsigned int> default_params(1, 1); std::vector<unsigned int> default_params(1, 1);
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( int res = tuner.TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer); "SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
default_params[0] = 2; default_params[0] = 2;
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( res = tuner.TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer); "SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect + 1, res); EXPECT_EQ(expect + 1, res);
} }
...@@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) { ...@@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) {
return {{1}, {2}, {3}, {4}}; return {{1}, {2}, {3}, {4}};
}; };
// tune // tune
Tuner<unsigned int> tuner;
WallClockTimer timer; WallClockTimer timer;
int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( int res = tuner.TuneOrRun<unsigned int>(
"SimpleRun", default_params, *params_generator, TunerFunc, &timer); "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
// run // run
res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>( res = tuner.template TuneOrRun<unsigned int>(
"SimpleRun", default_params, nullptr, TunerFunc, &timer); "SimpleRun", default_params, nullptr, TunerFunc, &timer);
EXPECT_EQ(expect, res); EXPECT_EQ(expect, res);
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册