diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index 9a689f45f9c2ad63bc1711a28b81b1e39b56cef8..26fb2d0b0e4e17355efc9122c00a25147ffe00ba 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -22,7 +22,6 @@
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
 #include "mace/benchmark/statistics.h"
@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {
 
   mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
 
-  // config runtime
-  MaceStatus ret = mace::SetOpenMPThreadPolicy(
+  // configuration
+  MaceStatus mace_status;
+  MaceEngineConfig config(device_type);
+  mace_status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
       true);
-  if (ret != MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
+  if (mace_status != MACE_SUCCESS) {
+    LOG(INFO) << "Set openmp or cpu affinity failed.";
   }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
-  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
-  const std::string kernel_file_path =
-      std::string(kernel_path == nullptr ?
-                  "/data/local/tmp/mace_run/interior" : kernel_path);
-
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(kernel_file_path));
-  SetKVStorageFactory(storage_factory);
-
   // Create Engine
   std::shared_ptr<mace::MaceEngine> engine;
   MaceStatus create_engine_status;
@@ -306,7 +309,7 @@ int Main(int argc, char **argv) {
                                 model_data_file_ptr,
                                 input_names,
                                 output_names,
-                                device_type,
+                                config,
                                 &engine);
 #else
   create_engine_status =
@@ -314,7 +317,7 @@ int Main(int argc, char **argv) {
                           model_data_file_ptr,
                           input_names,
                           output_names,
-                          device_type,
+                          config,
                           &engine);
 #endif
   if (create_engine_status != MaceStatus::MACE_SUCCESS) {
diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc
index 07776bc12fbcf6fd9db34577d8a0ea63a766f865..d9b5c3c226049a43a43d9a22feee04ad4a9f5add 100644
--- a/mace/core/allocator.cc
+++ b/mace/core/allocator.cc
@@ -13,30 +13,12 @@
 // limitations under the License.
#include "mace/core/allocator.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_allocator.h" -#endif namespace mace { -std::map *gAllocatorRegistry() { - static std::map g_allocator_registry; - return &g_allocator_registry; +Allocator *GetCPUAllocator() { + static CPUAllocator allocator; + return &allocator; } -Allocator *GetDeviceAllocator(DeviceType type) { - auto iter = gAllocatorRegistry()->find(type); - if (iter == gAllocatorRegistry()->end()) { - LOG(ERROR) << "Allocator not found for device " << type; - return nullptr; - } - return iter->second; -} - -MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator()); -#ifdef MACE_ENABLE_OPENCL -MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator()); -#endif -MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator()); - } // namespace mace diff --git a/mace/core/allocator.h b/mace/core/allocator.h index a212e7f91434e13c6d4dd101bab16ce855153842..51f04741ca9b2d8d729673c14162024a1d9390d5 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -26,8 +26,6 @@ #include "mace/core/registry.h" #include "mace/core/types.h" #include "mace/core/runtime_failure_mock.h" -#include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" namespace mace { @@ -138,26 +136,8 @@ class CPUAllocator : public Allocator { bool OnHost() const override { return true; } }; -std::map *gAllocatorRegistry(); - -Allocator *GetDeviceAllocator(DeviceType type); - -struct AllocatorRegisterer { - explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) { - if (gAllocatorRegistry()->count(type)) { - LOG(ERROR) << "Allocator for device type " << type - << " registered twice. This should not happen." - << gAllocatorRegistry()->count(type); - std::exit(1); - } - gAllocatorRegistry()->emplace(type, alloc); - } -}; - -#define MACE_REGISTER_ALLOCATOR(type, alloc) \ - namespace { \ - static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \ - } +// Global CPU allocator used for CPU/GPU/DSP +Allocator *GetCPUAllocator(); } // namespace mace diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 3e1cca9323001359207f3971803fbbc017bf95b5..50ec4eade9c05eb12d0b555595a665e590a14965 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -20,7 +20,6 @@ #include #include "mace/proto/mace.pb.h" -#include "mace/public/mace.h" namespace mace { diff --git a/mace/core/buffer.h b/mace/core/buffer.h index b349cf4b4de46a39c51822d880e5132944ff1a22..c57a1714aa91e469e5e2d6ec6de392f8ca868821 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -218,9 +218,9 @@ class Buffer : public BufferBase { class Image : public BufferBase { public: - Image() + explicit Image(Allocator *allocator) : BufferBase(0), - allocator_(GetDeviceAllocator(GPU)), + allocator_(allocator), buf_(nullptr), mapped_buf_(nullptr) {} diff --git a/mace/core/device.cc b/mace/core/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..09f5a068b934b535347a8992b01f162466f0b4c6 --- /dev/null +++ b/mace/core/device.cc @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device.h"
+
+namespace mace {
+
+CPUDevice::CPUDevice(const int num_threads)
+    : cpu_runtime_(new CPURuntime(num_threads)) {}
+
+CPUDevice::~CPUDevice() = default;
+
+CPURuntime *CPUDevice::cpu_runtime() {
+  return cpu_runtime_.get();
+}
+
+#ifdef MACE_ENABLE_OPENCL
+OpenCLRuntime *CPUDevice::opencl_runtime() {
+  return nullptr;
+}
+#endif
+
+Allocator *CPUDevice::allocator() {
+  return GetCPUAllocator();
+}
+
+DeviceType CPUDevice::device_type() const {
+  return DeviceType::CPU;
+}
+
+}  // namespace mace
diff --git a/mace/core/device.h b/mace/core/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..7336d79f8cb597005bb6c9021f320396ec9e48f0
--- /dev/null
+++ b/mace/core/device.h
@@ -0,0 +1,60 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_H_
+#define MACE_CORE_DEVICE_H_
+
+#include <memory>
+
+#include "mace/core/runtime/cpu/cpu_runtime.h"
+#include "mace/core/allocator.h"
+
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
+
+namespace mace {
+
+class Device {
+ public:
+  virtual ~Device() {}
+
+#ifdef MACE_ENABLE_OPENCL
+  virtual OpenCLRuntime *opencl_runtime() = 0;
+#endif
+  virtual CPURuntime *cpu_runtime() = 0;
+
+  virtual Allocator *allocator() = 0;
+  virtual DeviceType device_type() const = 0;
+};
+
+class CPUDevice : public Device {
+ public:
+  explicit CPUDevice(const int num_threads);
+  virtual ~CPUDevice();
+
+#ifdef MACE_ENABLE_OPENCL
+  OpenCLRuntime *opencl_runtime() override;
+#endif
+  CPURuntime *cpu_runtime() override;
+
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
+
+ private:
+  std::unique_ptr<CPURuntime> cpu_runtime_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_H_
diff --git a/mace/core/device_context.cc b/mace/core/device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88a965fa2635da79dda3f3158b084c8ec8f41b11
--- /dev/null
+++ b/mace/core/device_context.cc
@@ -0,0 +1,73 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device_context.h"
+
+#include <sys/stat.h>
+
+namespace mace {
+
+namespace {
+
+const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
+
+std::string FindFirstExistPath(const std::vector<std::string> &paths) {
+  std::string result;
+  struct stat st;
+  for (auto path : paths) {
+    if (stat(path.c_str(), &st) == 0) {
+      if (S_ISREG(st.st_mode)) {
+        result = path;
+        break;
+      }
+    }
+  }
+  return result;
+}
+}  // namespace
+
+GPUContext::GPUContext(const std::string &storage_path,
+                       const std::vector<std::string> &opencl_binary_paths,
+                       const std::string &opencl_parameter_path)
+    : storage_factory_(new FileStorageFactory(storage_path)),
+      opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
+
+  if (!storage_path.empty()) {
+    opencl_cache_storage_ =
+        storage_factory_->CreateStorage(kPrecompiledProgramFileName);
+  }
+
+  std::string precompiled_binary_path =
+      FindFirstExistPath(opencl_binary_paths);
+  if (!precompiled_binary_path.empty()) {
+    opencl_binary_storage_.reset(
+        new FileStorage(precompiled_binary_path));
+  }
+}
+
+GPUContext::~GPUContext() = default;
+
+KVStorage *GPUContext::opencl_binary_storage() {
+  return opencl_binary_storage_.get();
+}
+
+KVStorage *GPUContext::opencl_cache_storage() {
+  return opencl_cache_storage_.get();
+}
+
+Tuner<uint32_t> *GPUContext::opencl_tuner() {
+  return opencl_tuner_.get();
+}
+
+}  // namespace mace
diff --git a/mace/core/device_context.h b/mace/core/device_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..21d076730f25b3070c0f680de5b6370d860612f9
--- /dev/null
+++ b/mace/core/device_context.h
@@ -0,0 +1,47 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_CONTEXT_H_
+#define MACE_CORE_DEVICE_CONTEXT_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/core/file_storage.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+
+class GPUContext {
+ public:
+  GPUContext(const std::string &storage_path = "",
+             const std::vector<std::string> &opencl_binary_path = {},
+             const std::string &opencl_parameter_path = "");
+  ~GPUContext();
+
+  KVStorage *opencl_binary_storage();
+  KVStorage *opencl_cache_storage();
+  Tuner<uint32_t> *opencl_tuner();
+
+ private:
+  std::unique_ptr<KVStorageFactory> storage_factory_;
+  std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
+  std::unique_ptr<KVStorage> opencl_binary_storage_;
+  std::unique_ptr<KVStorage> opencl_cache_storage_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_CONTEXT_H_
diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc
index 99731a813f26b9c5593b492b0f2f16ec1e653f40..7c1fb35b3ebac4df42d2aac451ccbbbf0b4de464 100644
--- a/mace/core/file_storage.cc
+++ b/mace/core/file_storage.cc
@@ -28,10 +28,36 @@
 
 namespace mace {
 
-std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
+class FileStorageFactory::Impl {
+ public:
+  explicit Impl(const std::string &path);
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
+
+ private:
+  std::string path_;
+};
+
+FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
+
+std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
+    const std::string &name) {
+  return std::move(std::unique_ptr<KVStorage>(
+      new FileStorage(path_ + "/" + name)));
+}
+
+FileStorageFactory::FileStorageFactory(const std::string &path):
+    impl_(new FileStorageFactory::Impl(path)) {}
+
+FileStorageFactory::~FileStorageFactory() = default;
+
+std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
+    const std::string &name) {
+  return impl_->CreateStorage(name);
+}
 
 FileStorage::FileStorage(const std::string &file_path):
-    data_changed_(false), file_path_(file_path) {}
+    loaded_(false), data_changed_(false), file_path_(file_path) {}
 
 int FileStorage::Load() {
   struct stat st;
@@ -47,6 +73,9 @@ int FileStorage::Load() {
     }
   }
   utils::WriteLock lock(&data_mutex_);
+  if (loaded_) {
+    return 0;
+  }
   int fd = open(file_path_.c_str(), O_RDONLY);
   if (fd < 0) {
     if (errno == ENOENT) {
@@ -118,13 +147,17 @@ int FileStorage::Load() {
                << " failed, error code: " << strerror(errno);
     return -1;
   }
+  loaded_ = true;
   return 0;
 }
 
-void FileStorage::Clear() {
+bool FileStorage::Clear() {
   utils::WriteLock lock(&data_mutex_);
-  data_.clear();
-  data_changed_ = true;
+  if (!data_.empty()) {
+    data_.clear();
+    data_changed_ = true;
+  }
+  return true;
 }
 
 bool FileStorage::Insert(const std::string &key,
diff --git a/mace/core/file_storage.h b/mace/core/file_storage.h
index 3b648c23379c0502d5272ee720d52d7ae792b9b2..c4efe8c3565229b371a99f59fc71345c041577bf 100644
--- a/mace/core/file_storage.h
+++ b/mace/core/file_storage.h
@@ -16,27 +16,64 @@
 #define MACE_CORE_FILE_STORAGE_H_
 
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "mace/public/mace_runtime.h"
+#include "mace/public/mace.h"
 #include "mace/utils/rwlock.h"
 
 namespace mace {
 
+class KVStorage {
+ public:
+  // return: 0 for success, -1 for error
+  virtual int Load() = 0;
+  virtual bool Clear() = 0;
+  // insert or update the key-value.
+  virtual bool Insert(const std::string &key,
+                      const std::vector<unsigned char> &value) = 0;
+  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
+  // return: 0 for success, -1 for error
+  virtual int Flush() = 0;
+  virtual ~KVStorage() {}
+};
+
+class KVStorageFactory {
+ public:
+  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
+
+  virtual ~KVStorageFactory() {}
+};
+
+class FileStorageFactory : public KVStorageFactory {
+ public:
+  // You have to make sure your APP have read and write permission of the path.
+  explicit FileStorageFactory(const std::string &path);
+
+  ~FileStorageFactory();
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 class FileStorage : public KVStorage {
  public:
   explicit FileStorage(const std::string &file_path);
 
  public:
   int Load() override;
-  void Clear() override;
+  bool Clear() override;
   bool Insert(const std::string &key,
               const std::vector<unsigned char> &value) override;
   const std::vector<unsigned char> *Find(const std::string &key) override;
   int Flush() override;
 
  private:
+  bool loaded_;
   bool data_changed_;
   std::string file_path_;
   std::map<std::string, std::vector<unsigned char>> data_;
diff --git a/mace/core/net.cc b/mace/core/net.cc
index ec8afdd14f6a7abf8fbe4bd8d4b0bab4dd5e4e94..0c538b801bb1f9c8bcbbc109dc80fc0893255dfe 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -18,6 +18,7 @@
 
 #include "mace/core/macros.h"
 #include "mace/core/net.h"
+#include "mace/public/mace.h"
 #include "mace/utils/memory_logging.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/utils.h"
@@ -27,30 +28,35 @@ namespace mace {
 NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
                  const std::shared_ptr<const NetDef> net_def,
                  Workspace *ws,
-                 DeviceType type)
+                 Device *device)
     : name_(net_def->name()), op_registry_(op_registry) {
   MACE_UNUSED(ws);
-  MACE_UNUSED(type);
+  MACE_UNUSED(device);
 }
 
 SerialNet::SerialNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
    const std::shared_ptr<const NetDef> net_def,
    Workspace *ws,
-    DeviceType type,
+    Device *device,
    const NetMode mode)
-    : NetBase(op_registry, net_def, ws, type), device_type_(type) {
+    : NetBase(op_registry, net_def, ws, device), device_(device),
+      op_kernel_context_(new OpKernelContext(ws, device)) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
+  DeviceType device_type = device->device_type();
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto &operator_def = net_def->op(idx);
     // TODO(liuqi): refactor to add device_type to OperatorDef
     const int op_device = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-        operator_def, "device", static_cast<int>(device_type_));
-    if (op_device == type) {
+        operator_def, "device", static_cast<int>(device_type));
+    if (op_device == device_type) {
+      VLOG(3) << "Creating operator " << operator_def.name() << "("
+              << operator_def.type() << ")";
       OperatorDef temp_def(operator_def);
       std::unique_ptr<OperatorBase> op(
-          op_registry->CreateOperator(temp_def, ws, type, mode));
+          op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
+                                      device_type, mode));
       if (op) {
         operators_.emplace_back(std::move(op));
       }
@@ -61,13 +67,14 @@ SerialNet::SerialNet(
 MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
   MACE_MEMORY_LOGGING_GUARD();
   MACE_LATENCY_LOGGER(1, "Running net");
+  const DeviceType device_type = device_->device_type();
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
     MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                         op->debug_def().type(), "), mem_id: ",
                         MakeListString(op->debug_def().mem_id().data(),
                                        op->debug_def().mem_id().size()));
-    bool future_wait = (device_type_ == DeviceType::GPU &&
+    bool future_wait = (device_type == DeviceType::GPU &&
                         (run_metadata != nullptr ||
                          std::distance(iter, operators_.end()) == 1));
@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       } else {
         future.wait_fn(nullptr);
       }
+#ifdef MACE_ENABLE_OPENCL
+      device_->opencl_runtime()->command_queue().finish();
+#endif
     } else if (run_metadata != nullptr) {
       call_stats.start_micros = NowMicros();
       MACE_RETURN_IF_ERROR(op->Run(nullptr));
@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       VLOG(3) << "Operator " << op->debug_def().name()
               << " has shape: " << MakeString(op->Output(0)->shape());
 
-      if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) {
+      if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
         for (int i = 0; i < op->OutputSize(); ++i) {
           int data_type = op->GetOptionalArg<int>("T", static_cast<int>(DT_FLOAT));
           if (data_type == static_cast<int>(DT_FLOAT)) {
@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const NetDef &net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode) {
   std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
-  return CreateNet(op_registry, tmp_net_def, ws, type, mode);
+  return CreateNet(op_registry, tmp_net_def, ws, device, mode);
 }
 
 std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode) {
   std::unique_ptr<NetBase> net(
-      new SerialNet(op_registry, net_def, ws, type, mode));
+      new SerialNet(op_registry, net_def, ws, device, mode));
   return net;
 }
diff --git a/mace/core/net.h b/mace/core/net.h
index 0cec40594c5a12924ff3ee82595b12af4b6f689c..a63ded668e582e46d0d8a60b492478797a514cd5 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/public/mace.h"
 
 namespace mace {
 
@@ -33,7 +32,7 @@ class NetBase {
   NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
           const std::shared_ptr<const NetDef> net_def,
           Workspace *ws,
-          DeviceType type);
+          Device *device);
   virtual ~NetBase() noexcept {}
 
   virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
   SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
             const std::shared_ptr<const NetDef> net_def,
             Workspace *ws,
-            DeviceType type,
+            Device *device,
             const NetMode mode = NetMode::NORMAL);
 
   MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
 
  protected:
   std::vector<std::unique_ptr<OperatorBase> > operators_;
-  DeviceType device_type_;
+  Device *device_;
+  std::unique_ptr<OpKernelContext> op_kernel_context_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
 };
@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const NetDef &net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode = NetMode::NORMAL);
 
 std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode = NetMode::NORMAL);
 
 }  // namespace mace
diff --git a/mace/core/op_kernel_context.cc b/mace/core/op_kernel_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20f9e561a43ea58179818fcf03989020bf6692a5
--- /dev/null
+++ b/mace/core/op_kernel_context.cc
@@ -0,0 +1,32 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/op_kernel_context.h"
+
+namespace mace {
+
+OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
+    : device_(device), ws_(ws) {}
+
+OpKernelContext::~OpKernelContext() = default;
+
+Device* OpKernelContext::device() {
+  return device_;
+}
+
+Workspace* OpKernelContext::workspace() {
+  return ws_;
+}
+
+}  // namespace mace
diff --git a/mace/core/op_kernel_context.h b/mace/core/op_kernel_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe5e777cd5b5647ccb42f684fb1363224740333e
--- /dev/null
+++ b/mace/core/op_kernel_context.h
@@ -0,0 +1,34 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
+#define MACE_CORE_OP_KERNEL_CONTEXT_H_
+
+#include "mace/core/device.h"
+#include "mace/core/workspace.h"
+namespace mace {
+
+class OpKernelContext {
+ public:
+  OpKernelContext(Workspace *ws, Device *device);
+  ~OpKernelContext();
+  Device *device();
+  Workspace *workspace();
+ private:
+  Device *device_;
+  Workspace *ws_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_OP_KERNEL_CONTEXT_H_
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 20769fa30bfb5e87eb21fdcbb0c3b98b98365570..5e4048358bfc7717da3a09e93899800750bb157a 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -18,12 +18,15 @@
 #include <memory>
 
 #include "mace/core/operator.h"
+#include "mace/core/op_kernel_context.h"
 
 namespace mace {
 
-OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
-    : operator_ws_(ws),
-      operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
+OperatorBase::OperatorBase(const OperatorDef &operator_def,
+                           OpKernelContext *context)
+    : operator_def_(std::make_shared<OperatorDef>(operator_def)) {
+  MACE_UNUSED(context);
+}
 
 OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
 
@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}
 
 std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
     const OperatorDef &operator_def,
-    Workspace *ws,
+    OpKernelContext *context,
     DeviceType type,
     const NetMode mode) const {
   const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
             .Device(type)
             .TypeConstraint("T", static_cast<DataType>(dtype))
             .Build(),
-        operator_def, ws);
+        operator_def, context);
   } else {
     return nullptr;
   }
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 330f8002288badec78de4d6987caff0d0762cb05..6be38890ebad2c18448ed01a69c57ff016ea2775 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -22,17 +22,17 @@
"mace/core/arg_helper.h" #include "mace/core/future.h" +#include "mace/core/op_kernel_context.h" #include "mace/core/registry.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/proto/mace.pb.h" -#include "mace/public/mace.h" namespace mace { class OperatorBase { public: - explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws); + explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *); virtual ~OperatorBase() noexcept {} template @@ -78,7 +78,6 @@ class OperatorBase { inline bool has_debug_def() const { return operator_def_ != nullptr; } protected: - Workspace *operator_ws_; std::shared_ptr operator_def_; std::vector inputs_; std::vector outputs_; @@ -89,8 +88,9 @@ class OperatorBase { template class Operator : public OperatorBase { public: - explicit Operator(const OperatorDef &operator_def, Workspace *ws) - : OperatorBase(operator_def, ws) { + explicit Operator(const OperatorDef &operator_def, OpKernelContext *context) + : OperatorBase(operator_def, context) { + Workspace *ws = context->workspace(); for (const std::string &input_str : operator_def.input()) { const Tensor *tensor = ws->GetTensor(input_str); MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), @@ -116,7 +116,7 @@ class Operator : public OperatorBase { output_type = DataTypeToEnum::v(); } outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( - output_str, GetDeviceAllocator(D), output_type))); + output_str, context->device()->allocator(), output_type))); } } } @@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { class OperatorRegistryBase { public: - typedef Registry + typedef Registry RegistryType; OperatorRegistryBase() = default; virtual ~OperatorRegistryBase(); RegistryType *registry() { return ®istry_; } std::unique_ptr CreateOperator(const OperatorDef &operator_def, - Workspace *ws, + OpKernelContext *context, DeviceType type, const NetMode mode) const; @@ -183,7 +186,7 @@ class OperatorRegistryBase { MACE_DECLARE_REGISTRY(OpRegistry, OperatorBase, const OperatorDef &, - Workspace *); + OpKernelContext *); #define MACE_REGISTER_OPERATOR(op_registry, name, ...) 
   MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
diff --git a/mace/core/registry.h b/mace/core/registry.h
index ac81328731c3178ee19798cae862086f25ed8c29..1ad92f0aab36fe130497209fe7ebe034f1e025ad 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -22,7 +22,6 @@
 #include <string>
 #include <vector>
 
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index f9b1d49f2f9dad0408a3b1922c12169444aa549c..671d4cdfdbae34f6c2a4026c6bfceceebe78292c 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -30,7 +30,6 @@
 #include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h
index 3382a8f1c66de2b8fa41b3420b380efc91da5ab1..83d397ee88b39a9e31d72198bc56f950969168c9 100644
--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -18,7 +18,6 @@
 #include <vector>
 
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 
@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
                                              CPUAffinityPolicy policy,
                                              bool use_gemmlowp = false);
 
+class CPURuntime {
+ public:
+  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  ~CPURuntime() = default;
+  inline int num_threads() const {
+    return num_threads_;
+  }
+ private:
+  int num_threads_;
+};
 }  // namespace mace
 
 #endif  // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd9e41bb16db3151e77f1742f23c8866f427810f
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -0,0 +1,44 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "mace/core/runtime/opencl/gpu_device.h" + +namespace mace { + +GPUDevice::GPUDevice(Tuner *tuner, + KVStorage *opencl_cache_storage, + const GPUPriorityHint priority, + const GPUPerfHint perf, + KVStorage *opencl_binary_storage, + const int num_threads) : + CPUDevice(num_threads), + runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf, + opencl_binary_storage, tuner)), + allocator_(new OpenCLAllocator(runtime_.get())) {} + +GPUDevice::~GPUDevice() = default; + +OpenCLRuntime* GPUDevice::opencl_runtime() { + return runtime_.get(); +} + +Allocator* GPUDevice::allocator() { + return allocator_.get(); +} + +DeviceType GPUDevice::device_type() const { + return DeviceType::GPU; +} + +} // namespace mace diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h new file mode 100644 index 0000000000000000000000000000000000000000..1526ba0ae4ed7cb3b1170f89dc786da279c925bd --- /dev/null +++ b/mace/core/runtime/opencl/gpu_device.h @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_ +#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_ + +#include + +#include "mace/core/device_context.h" +#include "mace/core/device.h" +#include "mace/core/runtime/opencl/opencl_allocator.h" + +namespace mace { + +class GPUDevice : public CPUDevice { + public: + GPUDevice(Tuner *tuner, + KVStorage *opencl_cache_storage = nullptr, + const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW, + const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL, + KVStorage *opencl_binary_storage = nullptr, + const int num_threads = -1); + ~GPUDevice(); + OpenCLRuntime *opencl_runtime() override; + Allocator *allocator() override; + DeviceType device_type() const override; + private: + std::unique_ptr runtime_; + std::unique_ptr allocator_; +}; + +} // namespace mace +#endif // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_ diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index 86b0138d727da41171c315fde3e121d88877fb04..c22e4f8f5663a27b9596915dad1f64864c8f3ec9 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <memory>
+
 #include "mace/core/runtime/opencl/opencl_allocator.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   }
 }
 }  // namespace
 
-OpenCLAllocator::OpenCLAllocator() {}
+OpenCLAllocator::OpenCLAllocator(
+    OpenCLRuntime *opencl_runtime):
+    opencl_runtime_(opencl_runtime) {}
 
 OpenCLAllocator::~OpenCLAllocator() {}
 
 MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
   }
 
   cl_int error;
-  cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
+  cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
                                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                       nbytes, nullptr, &error);
   if (error != CL_SUCCESS) {
@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
 
   cl_int error;
   cl::Image2D *cl_image =
-      new cl::Image2D(OpenCLRuntime::Global()->context(),
+      new cl::Image2D(opencl_runtime_->context(),
                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format, image_shape[0], image_shape[1], 0,
                       nullptr, &error);
   if (error != CL_SUCCESS) {
@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
 }
 
 void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
+  VLOG(3) << "Map OpenCL buffer";
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
   // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
 void *OpenCLAllocator::MapImage(void *buffer,
                                 const std::vector<size_t> &image_shape,
                                 std::vector<size_t> *mapped_image_pitch) const {
-  MACE_CHECK(image_shape.size() == 2, "Just support map 2d image");
+  VLOG(3) << "Map OpenCL Image";
+  MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
   auto cl_image = static_cast<cl::Image2D *>(buffer);
   std::array<size_t, 3> origin = {0, 0, 0};
   std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
 
   mapped_image_pitch->resize(2);
   cl_int error;
-  void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
+  void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
       *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
       mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
      nullptr, &error);
@@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer,
 }
 
 void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
+  VLOG(3) << "Unmap OpenCL buffer/Image";
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
   cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
                                              nullptr, nullptr);
   if (error != CL_SUCCESS) {
diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h
index 6304add8583f7b2e47c58cd6e6b186ea43b7f092..d2b7556beb09086ca8091dbd70eb4566c62414a6 100644
--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -15,15 +15,17 @@
 #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
 #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
 
+#include <memory>
 #include <vector>
 
 #include "mace/core/allocator.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
 
 class OpenCLAllocator : public Allocator {
  public:
-  OpenCLAllocator();
+  explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime);
 
   ~OpenCLAllocator() override;
 
@@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator {
   void Unmap(void *buffer, void *mapped_ptr) const override;
 
   bool OnHost() const override;
+
+ private:
+  OpenCLRuntime *opencl_runtime_;
 };
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 305337034d98622db928d9133c4cc69597900ffe..967a040f01395d1ea13b25d6c1a1c67650a95c3b 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -24,11 +24,9 @@
 #include <string>
 #include <vector>
 
-#include "mace/public/mace_runtime.h"
 #include "mace/core/macros.h"
 #include "mace/core/file_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
-#include "mace/public/mace.h"
 #include "mace/utils/tuner.h"
 
 namespace mace {
@@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector<std::string> &paths) {
 
 const char *kOpenCLPlatformInfoKey =
     "mace_opencl_precompiled_platform_info_key";
-const char *kPrecompiledProgramFileName =
-    "mace_cl_compiled_program.bin";
 }  // namespace
 
 void OpenCLProfilingTimer::StartTiming() {}
 
 void OpenCLProfilingTimer::StopTiming() {
-  OpenCLRuntime::Global()->command_queue().finish();
+  runtime_->command_queue().finish();
   start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
   stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
 }
@@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() {
   accumulated_micros_ = 0;
 }
 
-GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
-GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
-    GPUPriorityHint::PRIORITY_DEFAULT;
-std::string
-    OpenCLRuntime::kPrecompiledBinaryPath = "";  // NOLINT(runtime/string)
-
-OpenCLRuntime *OpenCLRuntime::Global() {
-  static OpenCLRuntime runtime;
-  return &runtime;
-}
-
-void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
-                              GPUPriorityHint gpu_priority_hint) {
-  OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
-  OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
-}
-
-void OpenCLRuntime::ConfigureOpenCLBinaryPath(
-    const std::vector<std::string> &paths) {
-  OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
-  if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-    LOG(WARNING) << "There is no precompiled OpenCL binary file in "
-                 << MakeString(paths);
-  }
-}
-
-OpenCLRuntime::OpenCLRuntime():
-    precompiled_binary_storage_(nullptr),
-    cache_storage_(nullptr),
+OpenCLRuntime::OpenCLRuntime(
+    KVStorage *cache_storage,
+    const GPUPriorityHint priority_hint,
+    const GPUPerfHint perf_hint,
+    KVStorage *precompiled_binary_storage,
+    Tuner<uint32_t> *tuner):
+    cache_storage_(cache_storage),
+    precompiled_binary_storage_(precompiled_binary_storage),
+    tuner_(tuner),
     is_opencl_avaliable_(false),
     is_profiling_enabled_(false),
     opencl_version_(CL_VER_UNKNOWN),
@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
   cl_command_queue_properties properties = 0;
 
   const char *profiling = getenv("MACE_OPENCL_PROFILING");
-  if (Tuner<uint32_t>::Get()->IsTuning() ||
+  if (IsTuning() ||
      (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
    properties |= CL_QUEUE_PROFILING_ENABLE;
    is_profiling_enabled_ = true;
@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties,
-                               OpenCLRuntime::kGPUPerfHint,
-                               OpenCLRuntime::kGPUPriorityHint);
+                               perf_hint,
+                               priority_hint);
     context_ = std::shared_ptr<cl::Context>(
         new cl::Context({*device_}, context_properties.data(), nullptr,
                        nullptr, &err));
@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
     return;
   }
 
-  extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;
-  if (kStorageFactory != nullptr) {
-    cache_storage_ =
-        kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
-
+  if (cache_storage_ != nullptr) {
     if (cache_storage_->Load() != 0) {
       LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
                    << "Please make sure the storage directory exist "
@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
   }
 
   if (cached_binary_platform_info != platform_info_) {
-    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      precompiled_binary_storage_.reset(
-          new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
+    if (precompiled_binary_storage_ == nullptr) {
+      VLOG(1) << "There is no precompiled OpenCL binary in"
+                 " all OpenCL binary paths.";
+    } else {
       if (precompiled_binary_storage_->Load() != 0) {
         LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
                      << "Please make sure the storage directory exist "
@@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
 
 cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
 
+Tuner<uint32_t> *OpenCLRuntime::tuner() { return tuner_; }
+
 uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
   return device_gloabl_mem_cache_size_;
 }
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 537707fa654fea9dce01c48297d802cdbc27bd2a..222fe8514a4cf4b08c944959e2faf8f646bf5c29 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -22,11 +22,12 @@
 #include <string>
 #include <vector>
 
+#include "mace/core/file_storage.h"
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/string_util.h"
 #include "mace/utils/timer.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 
@@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error);
     return MaceStatus::MACE_OUT_OF_RESOURCES; \
   }
 
-class OpenCLProfilingTimer : public Timer {
- public:
-  explicit OpenCLProfilingTimer(const cl::Event *event)
-      : event_(event), accumulated_micros_(0) {}
-  void StartTiming() override;
-  void StopTiming() override;
-  void AccumulateTiming() override;
-  void ClearTiming() override;
-  double ElapsedMicros() override;
-  double AccumulatedMicros() override;
-
- private:
-  const cl::Event *event_;
-  double start_nanos_;
-  double stop_nanos_;
-  double accumulated_micros_;
-};
-
 class OpenCLRuntime {
  public:
-  static OpenCLRuntime *Global();
-  static void Configure(GPUPerfHint, GPUPriorityHint);
-  static void ConfigureOpenCLBinaryPath(const std::vector<std::string> &paths);
+  OpenCLRuntime(
+      KVStorage *cache_storage = nullptr,
+      const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL,
+      const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL,
+      KVStorage *precompiled_binary_storage = nullptr,
+      Tuner<uint32_t> *tuner = nullptr);
+  ~OpenCLRuntime();
+  OpenCLRuntime(const OpenCLRuntime &) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
 
   cl::Context &context();
   cl::Device &device();
@@ -91,6 +80,7 @@ class OpenCLRuntime {
   const std::string platform_info() const;
   uint64_t device_global_mem_cache_size() const;
   uint32_t device_compute_units() const;
+  Tuner<uint32_t> *tuner();
   bool is_opencl_avaliable();
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
@@ -112,11 +102,6 @@ class OpenCLRuntime {
   void SaveBuiltCLProgram();
 
 private:
-  OpenCLRuntime();
-  ~OpenCLRuntime();
-  OpenCLRuntime(const OpenCLRuntime &) = delete;
-  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
-
   bool BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,
                     const std::string &build_options,
@@ -137,10 +122,13 @@ class OpenCLRuntime {
   OpenCLVersion ParseDeviceVersion(const std::string &device_version);
 
 private:
-  std::unique_ptr<KVStorage> precompiled_binary_storage_;
-  std::unique_ptr<KVStorage> cache_storage_;
+  KVStorage *cache_storage_;
+  KVStorage *precompiled_binary_storage_;
+  Tuner<uint32_t> *tuner_;
   bool is_opencl_avaliable_;
   bool is_profiling_enabled_;
+  OpenCLVersion opencl_version_;
+  GPUType gpu_type_;
   // All OpenCL object must be a pointer and manually deleted before unloading
   // OpenCL library.
   std::shared_ptr<cl::Context> context_;
@@ -149,18 +137,30 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string platform_info_;
-  OpenCLVersion opencl_version_;
   std::string precompiled_binary_platform_info_;
   bool out_of_range_check_;
   uint64_t device_gloabl_mem_cache_size_;
   uint32_t device_compute_units_;
-  GPUType gpu_type_;
-
-  static GPUPerfHint kGPUPerfHint;
-  static GPUPriorityHint kGPUPriorityHint;
-  static std::string kPrecompiledBinaryPath;
 };
 
+class OpenCLProfilingTimer : public Timer {
+ public:
+  OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
+      : runtime_(runtime), event_(event), accumulated_micros_(0) {}
+  void StartTiming() override;
+  void StopTiming() override;
+  void AccumulateTiming() override;
+  void ClearTiming() override;
+  double ElapsedMicros() override;
+  double AccumulatedMicros() override;
+
+ private:
+  OpenCLRuntime *runtime_;
+  const cl::Event *event_;
+  double start_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;
+};
 }  // namespace mace
 
 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 62ea5488a87f53233c049915c8170ff8eb41d709..f7e509876f1564b06cbcd94e433a8ca3c03e197f 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -25,7 +25,6 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
 #endif
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
 #ifdef MACE_ENABLE_NEON
@@ -38,10 +37,10 @@ namespace mace {
 #define MACE_SINGLE_ARG(...) __VA_ARGS__
-#define MACE_CASE(TYPE, STATEMENTS)   \
+#define MACE_CASE(TYPE, STATEMENTS)          \
   case DataTypeToEnum<TYPE>::value: {        \
     typedef TYPE T;                          \
-    STATEMENTS;                       \
+    STATEMENTS;                              \
     break;                                   \
   }
 
@@ -137,7 +136,7 @@ class Tensor {
     buffer_ = &buffer_slice_;
   }
 
-  Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
+  Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
 
   ~Tensor() {
     if (is_buffer_owner_ && buffer_ != nullptr) {
@@ -270,7 +269,7 @@ class Tensor {
     image_shape_ = image_shape;
     if (buffer_ == nullptr) {
       MACE_CHECK(is_buffer_owner_);
-      buffer_ = new Image();
+      buffer_ = new Image(allocator_);
       return buffer_->Allocate(image_shape, dtype_);
     } else {
       MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc
index 48a6928d40ad0ddf755ebcc311f9a125756e47e7..569a8345c147a763a2c2036b4ac082e60caed856 100644
--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -16,15 +16,10 @@
 
 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 
 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
-DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
 DEFINE_int32(cpu_affinity_policy, 1,
              "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
 
-  mace::OpenCLRuntime::Configure(
-      static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
-      static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
   mace::testing::Benchmark::Run(FLAGS_filter.c_str());
   return 0;
 }
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 07d855605ed744d64345ab722225a274bc09063c..4c9204cbf085acda7f4a9497da2a5c80afab88f0 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "mace/core/workspace.h" + +#include #include #include #include @@ -21,8 +24,6 @@ #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/opencl_runtime.h" #endif -#include "mace/core/workspace.h" -#include "mace/utils/timer.h" namespace mace { @@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { } } // namespace -Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer( - GetDeviceAllocator(DeviceType::CPU))) {} +Workspace::Workspace() : + host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {} Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, @@ -74,7 +75,7 @@ std::vector Workspace::Tensors() const { } MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, - DeviceType type, + Device *device, const unsigned char *model_data) { MACE_LATENCY_LOGGER(1, "Load model tensors"); index_t model_data_size = 0; @@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } VLOG(3) << "Model data size: " << model_data_size; + const DeviceType device_type = device->device_type(); + if (model_data_size > 0) { #ifdef MACE_ENABLE_OPENCL - if (type == DeviceType::GPU && - OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <= + if (device_type == DeviceType::GPU && + device->opencl_runtime()->GetDeviceMaxMemAllocSize() <= static_cast(model_data_size)) { for (auto &const_tensor : net_def.tensors()) { MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); @@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } std::unique_ptr tensor( - new Tensor(GetDeviceAllocator(type), + new Tensor(device->allocator(), const_tensor.data_type(), true)); tensor->Resize(dims); @@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, #else { #endif - if (type == DeviceType::CPU) { + if (device_type == DeviceType::CPU) { tensor_buffer_ = std::unique_ptr( - new Buffer(GetDeviceAllocator(type), + new Buffer(device->allocator(), const_cast(model_data), model_data_size)); } else { tensor_buffer_ = std::unique_ptr( - new Buffer(GetDeviceAllocator(type))); + new Buffer(device->allocator())); MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); tensor_buffer_->Map(nullptr); tensor_buffer_->Copy(const_cast(model_data), @@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } } - if (type == DeviceType::CPU || type == DeviceType::GPU) { - MaceStatus status = CreateOutputTensorBuffer(net_def, type); + if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) { + MaceStatus status = CreateOutputTensorBuffer(net_def, device); if (status != MaceStatus::MACE_SUCCESS) return status; } - if (type == DeviceType::CPU && net_def.has_quantize_info()) { + if (device_type == DeviceType::CPU && net_def.has_quantize_info()) { for (const auto &activation_info: net_def.quantize_info().activation_info()) { if (HasTensor(activation_info.tensor_name())) { @@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, } MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, - DeviceType device_type) { + Device *device) { + DeviceType device_type = device->device_type(); DataType dtype = DataType::DT_INVALID; if (net_def.mem_arena().mem_block_size() > 0) { // We use the data type of the first op with mem id, @@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, << ", memory type: " << mem_block.mem_type(); if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { std::unique_ptr 
-              new Buffer(GetDeviceAllocator(DeviceType::CPU)));
+              new Buffer(GetCPUAllocator()));
           MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
               mem_block.x() * GetEnumTypeSize(dtype)
                   + MACE_EXTRA_BUFFER_PAD_SIZE));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                             std::move(tensor_buf));
         } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
           std::unique_ptr<BufferBase> image_buf(
-              new Image());
+              new Image(device->allocator()));
           MACE_RETURN_IF_ERROR(image_buf->Allocate(
               {mem_block.x(), mem_block.y()}, dtype));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                             std::move(image_buf));
         } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
           std::unique_ptr<BufferBase> tensor_buf(
-              new Buffer(GetDeviceAllocator(DeviceType::GPU)));
+              new Buffer(device->allocator()));
           MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
               mem_block.x() * GetEnumTypeSize(dtype)));
           preallocated_allocator_.SetBuffer(mem_block.mem_id(),
@@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                 op, "T", static_cast<int>(DT_FLOAT)));
           }
           CreateTensor(op.output(i),
-                       GetDeviceAllocator(device_type),
+                       device->allocator(),
                        output_type);
         }
       }
@@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() {
 }
 
 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
-                                      const unsigned char *model_data) {
+                                      const unsigned char *model_data,
+                                      Allocator *alloc) {
   for (auto &const_tensor : net_def.tensors()) {
     auto iter = tensor_map_.find(const_tensor.name());
     if (iter->second->unused()) {
@@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
         dims.push_back(d);
       }
       std::unique_ptr<Tensor> tensor(
-          new Tensor(GetDeviceAllocator(DeviceType::GPU),
-                     const_tensor.data_type()));
+          new Tensor(alloc, const_tensor.data_type()));
       tensor->Resize(dims);
       MACE_CHECK(tensor->size() == const_tensor.data_size(),
                  "Tensor's data_size not equal with the shape");
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index 20f214b0018a93b59b84d8bf4cae7004e4e6ba0d..71850098e03593083454acc7102743a0cc106f1b 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <vector>
 
+#include "mace/core/device.h"
 #include "mace/core/preallocated_pooled_allocator.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
@@ -48,7 +49,7 @@ class Workspace {
   std::vector<Tensor *> Tensors() const;
 
   MaceStatus LoadModelTensor(const NetDef &net_def,
-                             DeviceType type,
+                             Device *device,
                             const unsigned char *model_data);
 
   ScratchBuffer *GetScratchBuffer(DeviceType device_type);
@@ -56,11 +57,14 @@ class Workspace {
   void RemoveUnusedBuffer();
 
   void RemoveAndReloadBuffer(const NetDef &net_def,
-                             const unsigned char *model_data);
+                             const unsigned char *model_data,
+                             Allocator *alloc);
 
  private:
   MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
-                                      DeviceType device_type);
+                                      Device *device);
+
+  Device *device_;
 
   TensorMap tensor_map_;
diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
index edd7bf1802012012a0881fb82e00924a1c109405..5788801c0277a1d24834dd6d8470cd2d02c8f939 100644
--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java
@@ -37,15 +37,13 @@ public class AppModel {
         mJniThread = new Handler(thread.getLooper());
     }
 
-    public void maceMobilenetSetAttrs(final InitData initData) {
+    public void maceMobilenetCreateGPUContext(final InitData initData) {
         mJniThread.post(new Runnable() {
             @Override
             public void run() {
-                int result = JniMaceUtils.maceMobilenetSetAttrs(
-                        initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
-                        initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
-                        initData.getKernelPath());
-                Log.i("APPModel", "maceMobilenetSetAttrs result = " + result);
+                int result = JniMaceUtils.maceMobilenetCreateGPUContext(
+                        initData.getStoragePath());
+                Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result);
             }
         });
     }
@@ -54,7 +52,10 @@ public class AppModel {
         mJniThread.post(new Runnable() {
             @Override
             public void run() {
-                int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice());
+                int result = JniMaceUtils.maceMobilenetCreateEngine(
+                        initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(),
+                        initData.getGpuPerfHint(), initData.getGpuPriorityHint(),
+                        initData.getModel(), initData.getDevice());
                 Log.i("APPModel", "maceMobilenetCreateEngine result = " + result);
 
                 if (result == -1) {
diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
index ab62a90fcf9227398b7fb8ab159a09ec3984aed1..f8adafc845c37e341e084fa208085dfaeaf4c3a2 100644
--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java
@@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap
     }
 
     private void initJni() {
-        AppModel.instance.maceMobilenetSetAttrs(initData);
+        AppModel.instance.maceMobilenetCreateGPUContext(initData);
         AppModel.instance.maceMobilenetCreateEngine(initData, this);
     }
diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
index ab0f54b587bc9bbcd574c12b6f449172cade1689..ffcbde9605a841f959fcec3ba5016618f004e223 100644
--- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
+++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java
@@ -29,7 +29,7 @@ public class InitData {
     private int cpuAffinityPolicy;
     private int gpuPerfHint;
     private int gpuPriorityHint;
-    private String kernelPath = "";
+    private String storagePath = "";
 
     public InitData() {
         model = MODELS[0];
@@ -38,8 +38,8 @@ public class InitData {
         gpuPerfHint = 3;
         gpuPriorityHint = 3;
         device = DEVICES[0];
-        kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
-        File file = new File(kernelPath);
+        storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace";
+        File file = new File(storagePath);
         if (!file.exists()) {
             file.mkdir();
         }
@@ -94,11 +94,11 @@ public class InitData {
         this.gpuPriorityHint = gpuPriorityHint;
     }
 
-    public String getKernelPath() {
-        return kernelPath;
+    public String getStoragePath() {
+        return storagePath;
     }
 
-    public void setKernelPath(String kernelPath) {
-        this.kernelPath = kernelPath;
+    public void setStoragePath(String storagePath) {
+        this.storagePath = storagePath;
     }
 }
diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc
index 0a0702702ae0f18ad31998fa562e0d0fa7237d16..4ccba56efa6b1a8de476eb6c7c7e00632136099f 100755
--- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc
+++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -26,7 +26,6 @@ #include #include "src/main/cpp/include/mace/public/mace.h" -#include "src/main/cpp/include/mace/public/mace_runtime.h" #include "src/main/cpp/include/mace/public/mace_engine_factory.h" namespace { @@ -39,8 +38,8 @@ struct ModelInfo { }; struct MaceContext { + std::shared_ptr gpu_context; std::shared_ptr engine; - std::shared_ptr storage_factory; std::string model_name; mace::DeviceType device_type = mace::DeviceType::CPU; std::map model_infos = { @@ -72,48 +71,65 @@ MaceContext& GetMaceContext() { } // namespace -JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs( - JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, - jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) { +JNIEXPORT jint JNICALL +Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext( + JNIEnv *env, jclass thisObj, jstring storage_path) { MaceContext &mace_context = GetMaceContext(); + // DO NOT USE tmp directory. + // Please use APP's own directory and make sure the directory exists. + const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr); + if (storage_path_ptr == nullptr) return JNI_ERR; + const std::string storage_file_path(storage_path_ptr); + env->ReleaseStringUTFChars(storage_path, storage_path_ptr); - mace::MaceStatus status; - // openmp - status = mace::SetOpenMPThreadPolicy( - omp_num_threads, - static_cast(cpu_affinity_policy)); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "openmp result: %d, threads: %d, cpu: %d", - status, omp_num_threads, cpu_affinity_policy); - - // gpu - mace::SetGPUHints( - static_cast(gpu_perf_hint), - static_cast(gpu_priority_hint)); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "gpu perf: %d, priority: %d", - gpu_perf_hint, gpu_priority_hint); - - // opencl cache - const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr); - if (kernel_path_ptr == nullptr) return JNI_ERR; - const std::string kernel_file_path(kernel_path_ptr); - mace_context.storage_factory.reset( - new mace::FileStorageFactory(kernel_file_path)); - mace::SetKVStorageFactory(mace_context.storage_factory); - env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr); + mace_context.gpu_context = mace::GPUContextBuilder() + .SetStoragePath(storage_file_path) + .Finalize(); return JNI_OK; } JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( - JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) { + JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, + jint gpu_perf_hint, jint gpu_priority_hint, + jstring model_name_str, jstring device) { MaceContext &mace_context = GetMaceContext(); + + // get device + const char *device_ptr = env->GetStringUTFChars(device, nullptr); + if (device_ptr == nullptr) return JNI_ERR; + mace_context.device_type = ParseDeviceType(device_ptr); + env->ReleaseStringUTFChars(device, device_ptr); + + // create MaceEngineConfig + mace::MaceStatus status; + mace::MaceEngineConfig config(mace_context.device_type); + status = config.SetCPUThreadPolicy( + omp_num_threads, + static_cast(cpu_affinity_policy)); + if (status != mace::MACE_SUCCESS) { + __android_log_print(ANDROID_LOG_ERROR, + "image_classify attrs", + "openmp result: %d, threads: %d, cpu: %d", + status, omp_num_threads, cpu_affinity_policy); + } + if (mace_context.device_type == mace::DeviceType::GPU) { + 
config.SetGPUContext(mace_context.gpu_context);
+    config.SetGPUHints(
+        static_cast<mace::GPUPerfHint>(gpu_perf_hint),
+        static_cast<mace::GPUPriorityHint>(gpu_priority_hint));
+    __android_log_print(ANDROID_LOG_INFO,
+                        "image_classify attrs",
+                        "gpu perf: %d, priority: %d",
+                        gpu_perf_hint, gpu_priority_hint);
+  }
+
+  __android_log_print(ANDROID_LOG_INFO,
+                      "image_classify attrs",
+                      "device: %d",
+                      mace_context.device_type);
+
   // parse model name
   const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr);
   if (model_name_ptr == nullptr) return JNI_ERR;
@@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
   std::vector<std::string> input_names = {model_info_iter->second.input_name};
   std::vector<std::string> output_names = {model_info_iter->second.output_name};

-  // get device
-  const char *device_ptr = env->GetStringUTFChars(device, nullptr);
-  if (device_ptr == nullptr) return JNI_ERR;
-  mace_context.device_type = ParseDeviceType(device_ptr);
-  env->ReleaseStringUTFChars(device, device_ptr);
-
-  __android_log_print(ANDROID_LOG_ERROR,
-                      "image_classify attrs",
-                      "device: %d",
-                      mace_context.device_type);
-
   mace::MaceStatus create_engine_status =
       CreateMaceEngineFromCode(mace_context.model_name,
                                std::string(),
                                input_names,
                                output_names,
-                               mace_context.device_type,
+                               config,
                                &mace_context.engine);

-  __android_log_print(ANDROID_LOG_ERROR,
+  __android_log_print(ANDROID_LOG_INFO,
                       "image_classify attrs",
                       "create result: %d",
                       create_engine_status);
diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
index bef7417bcb945c50bfad6673f1a11eb3b551387e..5114eb911af090f3b39ea45c931314530ba1e3ca 100644
--- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
+++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h
@@ -24,11 +24,13 @@ extern "C" {
 #endif
 /*
  * Class:     com_xiaomi_mace_JniMaceUtils
- * Method:    maceMobilenetSetAttrs
- * Signature: (Ljava/lang/String;IIIILjava/lang/String;)I
+ * Method:    maceMobilenetCreateGPUContext
+ * Signature: (Ljava/lang/String;)I
  */
-JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
-  (JNIEnv *, jclass, jint, jint, jint, jint, jstring);
+JNIEXPORT jint JNICALL
+Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *,
+                                                                jclass,
+                                                                jstring);
 /*
  * Class:     com_xiaomi_mace_JniMaceUtils
@@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs
  */
 JNIEXPORT jint JNICALL
 Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine
-  (JNIEnv *, jclass, jstring, jstring);
+  (JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring);
 /*
  * Class:     com_xiaomi_mace_JniMaceUtils
diff --git a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
index f9ab7a7af7c9358dee896ec9b5bec5152f516fce..e776c013189f64b902feec7592fe53b5f0c41308 100644
--- a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
+++ b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java
@@ -20,9 +20,9 @@ public class JniMaceUtils {
         System.loadLibrary("mace_mobile_jni");
     }

-    public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath);
+    public static native int maceMobilenetCreateGPUContext(String storagePath);

-    public static native int maceMobilenetCreateEngine(String model, String device);
+    public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device);

     public static native float[] maceMobilenetClassify(float[] input);
diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc
index 4892baf2e3a1dbc69f688b76a34f7d7a7a21f7dd..99436fa4876bcb1731dab73aac39f70ea8ef136a 100644
--- a/mace/examples/cli/example.cc
+++ b/mace/examples/cli/example.cc
@@ -21,7 +21,6 @@
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 // if convert model to code.
 #ifdef MODEL_GRAPH_FORMAT_CODE
 #include "mace/codegen/engine/mace_engine_factory.h"
 #endif
@@ -157,40 +156,40 @@ bool RunModel(const std::vector<std::string> &input_names,
               const std::vector<std::vector<int64_t>> &output_shapes) {
   // load model
   DeviceType device_type = ParseDeviceType(FLAGS_device);
-  // config runtime
-  mace::SetOpenMPThreadPolicy(
+  // configuration
+  // For detailed information, please see mace.h
+  MaceStatus status;
+  MaceEngineConfig config(device_type);
+  status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
       static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
+  if (status != MACE_SUCCESS) {
+    std::cerr << "Set openmp or cpu affinity failed." << std::endl;
+  }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
-    // Just call once. (Not thread-safe)
-    // Set paths of Generated OpenCL Compiled Kernel Binary file
-    // if you build gpu library of specific soc.
-    // Using OpenCL binary will speed up the initialization.
-    // OpenCL binary is corresponding to the OpenCL Driver version,
-    // you should update the binary when OpenCL Driver changed.
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
-  // DO NOT USE tmp directory.
-  // Please use APP's own directory and make sure the directory exists.
-  // Just call once
-  const std::string internal_storage_path =
-      "/data/local/tmp/mace_run/interior";
-
-  // Config internal kv storage factory.
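// ---------------------------------------------------------------------------
// Editor's note (not part of the diff): the hunks above migrate engine setup
// from process-wide Set*() calls to a per-engine MaceEngineConfig plus a
// shareable GPUContext. The sketch below condenses that flow into a single
// function. It is illustrative only: the model/tensor names are placeholders,
// the enum values (AFFINITY_BIG_ONLY, PERF_HIGH, PRIORITY_LOW) are assumed
// from MACE's public headers, and CreateMaceEngineFromCode() is the
// code-generated factory included via mace_engine_factory.h.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include "mace/public/mace.h"

std::shared_ptr<mace::MaceEngine> BuildEngine(mace::DeviceType device_type) {
  // CPU thread policy now lives on the config object, not in global state.
  mace::MaceEngineConfig config(device_type);
  mace::MaceStatus status = config.SetCPUThreadPolicy(
      4,  // omp_num_threads
      mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  if (status != mace::MACE_SUCCESS) {
    // The policy is a hint; engine creation can still proceed.
    std::cerr << "Set openmp or cpu affinity failed." << std::endl;
  }

  if (device_type == mace::DeviceType::GPU) {
    // The OpenCL storage path (and, optionally, precompiled binaries and
    // tuned parameters) hang off a GPUContext that engines can share.
    std::shared_ptr<mace::GPUContext> gpu_context =
        mace::GPUContextBuilder()
            .SetStoragePath("/data/data/com.example.app/files/mace")  // app-owned dir
            .Finalize();
    config.SetGPUContext(gpu_context);
    config.SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
                       mace::GPUPriorityHint::PRIORITY_LOW);
  }

  // Engine creation now consumes the config instead of a bare DeviceType.
  std::shared_ptr<mace::MaceEngine> engine;
  std::vector<std::string> input_names = {"input"};    // placeholder name
  std::vector<std::string> output_names = {"output"};  // placeholder name
  mace::MaceStatus create_status = mace::CreateMaceEngineFromCode(
      "mobilenet_v1",  // placeholder model tag
      std::string(), input_names, output_names, config, &engine);
  return create_status == mace::MACE_SUCCESS ? engine : nullptr;
}
// ---------------------------------------------------------------------------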
- std::shared_ptr storage_factory( - new FileStorageFactory(internal_storage_path)); - SetKVStorageFactory(storage_factory); - // Create Engine std::shared_ptr engine; MaceStatus create_engine_status; @@ -204,7 +203,7 @@ bool RunModel(const std::vector &input_names, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #else std::vector model_pb_data; @@ -216,7 +215,7 @@ bool RunModel(const std::vector &input_names, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #endif diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 51383ad44fe0c737acc3229287921e7c47b0173e..3159684d109b96c09e3909ffc296f083c0278a4d 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -126,10 +127,14 @@ template class ActivationFunctor; template <> -class ActivationFunctor { +class ActivationFunctor : OpKernel { public: - ActivationFunctor(ActivationType type, float relux_max_limit) - : activation_(type), relux_max_limit_(relux_max_limit) {} + ActivationFunctor(OpKernelContext *context, + ActivationType type, + float relux_max_limit) + : OpKernel(context), + activation_(type), + relux_max_limit_(relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *alpha, @@ -159,10 +164,14 @@ class ActivationFunctor { #ifdef MACE_ENABLE_OPENCL template -class ActivationFunctor { +class ActivationFunctor : OpKernel { public: - ActivationFunctor(ActivationType type, T relux_max_limit) - : activation_(type), relux_max_limit_(static_cast(relux_max_limit)) {} + ActivationFunctor(OpKernelContext *context, + ActivationType type, + T relux_max_limit) + : OpKernel(context), + activation_(type), + relux_max_limit_(static_cast(relux_max_limit)) {} MaceStatus operator()(const Tensor *input, const Tensor *alpha, diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 2215343f5c092870f4a400061afc857c96f3a465..d81f25a337410d1225f9d8e49e071e496372d79a 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -24,6 +24,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -35,10 +36,11 @@ namespace kernels { constexpr int kCostPerGroup = 1024; template -struct AddNFunctor { +struct AddNFunctor : OpKernel { + explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor, + StatsFuture *future) { MACE_UNUSED(future); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); index_t size = output_tensor->size(); @@ -95,7 +97,8 @@ struct AddNFunctor { #ifdef MACE_ENABLE_OPENCL template -struct AddNFunctor { +struct AddNFunctor : OpKernel { + explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future); diff --git a/mace/kernels/argmax.h b/mace/kernels/argmax.h index 54edc3ee7b718a69b7b7136dbba587f07d654997..36218d627fce5f220cd89120728e73887155fb16 100644 --- a/mace/kernels/argmax.h +++ b/mace/kernels/argmax.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include 
"mace/public/mace.h" #include "mace/utils/utils.h" @@ -30,7 +31,8 @@ namespace mace { namespace kernels { template -struct ArgMaxFunctor { +struct ArgMaxFunctor : OpKernel { + explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const Tensor *axis, Tensor *output, diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/kernels/arm/conv_winograd_test.cc index 166b67a5e4a33fd5165d4b3b8ec7de9ed0f683d4..1313543220580b896965bc3ef240e31b6edc3b09 100644 --- a/mace/kernels/arm/conv_winograd_test.cc +++ b/mace/kernels/arm/conv_winograd_test.cc @@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) { index_t filter_size = 3 * 3 * in_channels * out_channels; index_t output_size = batch * out_channels * out_height * out_width; - Tensor input; - Tensor filter; - Tensor output; - Tensor output_ref; + Tensor input(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor output(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT); input.Resize({batch, in_channels, in_height, in_width}); filter.Resize({out_channels, in_channels, 3, 3}); diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 6f934e6b14484475c96d6ffa34ce43deb3e0ebbe..4c9aac3a6415fdd8bc60e1af34ca3d51e9ca9a12 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -33,11 +33,13 @@ namespace mace { namespace kernels { -struct BatchNormFunctorBase { - BatchNormFunctorBase(bool folded_constant, +struct BatchNormFunctorBase : OpKernel { + BatchNormFunctorBase(OpKernelContext *context, + bool folded_constant, const ActivationType activation, const float relux_max_limit) - : folded_constant_(folded_constant), + : OpKernel(context), + folded_constant_(folded_constant), activation_(activation), relux_max_limit_(relux_max_limit) {} @@ -51,10 +53,14 @@ struct BatchNormFunctor; template<> struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, + BatchNormFunctor(OpKernelContext *context, + const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + : BatchNormFunctorBase(context, + folded_constant, + activation, + relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, @@ -132,10 +138,14 @@ struct BatchNormFunctor : BatchNormFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, + BatchNormFunctor(OpKernelContext *context, + const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + : BatchNormFunctorBase(context, + folded_constant, + activation, + relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, const Tensor *offset, diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index 1cd8421ccbf1c38a7b6cb86255cb1b652714b938..e2ea8ccfb88308e6fcfa3de731e6905fe970b13b 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,10 +31,10 @@ namespace mace { namespace kernels { -struct BiasAddFunctorBase { - explicit BiasAddFunctorBase(const DataFormat data_format) { - data_format_ = 
data_format; - } +struct BiasAddFunctorBase : OpKernel { + BiasAddFunctorBase(OpKernelContext *context, + const DataFormat data_format) + : OpKernel(context), data_format_(data_format) {} DataFormat data_format_; }; @@ -43,8 +44,9 @@ struct BiasAddFunctor; template <> struct BiasAddFunctor : BiasAddFunctorBase { - explicit BiasAddFunctor(const DataFormat data_format) - : BiasAddFunctorBase(data_format) {} + BiasAddFunctor(OpKernelContext *context, + const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, @@ -96,8 +98,8 @@ struct BiasAddFunctor : BiasAddFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct BiasAddFunctor : BiasAddFunctorBase { - explicit BiasAddFunctor(const DataFormat data_format) - : BiasAddFunctorBase(data_format) {} + BiasAddFunctor(OpKernelContext *context, const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, Tensor *output, diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index 1def908705686085ac5e5a73f9e022e6f4df27e1..4a2f731b0e49baee5db257998e1d82c665a0aee2 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -20,21 +20,24 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { -struct BufferToImageFunctorBase { - explicit BufferToImageFunctorBase(const int wino_blk_size) - : wino_blk_size_(wino_blk_size) {} +struct BufferToImageFunctorBase : OpKernel { + explicit BufferToImageFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(const int wino_blk_size) - : BufferToImageFunctorBase(wino_blk_size) {} + explicit BufferToImageFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferToImageFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(const int wino_blk_size) - : BufferToImageFunctorBase(wino_blk_size) {} + explicit BufferToImageFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferToImageFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index 920e1e1a2bf96bcc7d55009244bda693050f3780..029eb1c66b665baed39cacec05c9dbe9b45ca1b5 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -20,13 +20,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { template -struct ChannelShuffleFunctor { - explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} +struct ChannelShuffleFunctor : OpKernel { + ChannelShuffleFunctor(OpKernelContext *context, const int groups) + : OpKernel(context), groups_(groups) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -70,8 +72,9 @@ struct ChannelShuffleFunctor { #ifdef MACE_ENABLE_OPENCL template -struct ChannelShuffleFunctor { - explicit 
ChannelShuffleFunctor(const int groups) : groups_(groups) {} +struct ChannelShuffleFunctor : OpKernel { + ChannelShuffleFunctor(OpKernelContext *context, const int groups) + : OpKernel(context), groups_(groups) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 1728ca088bb8c0161191eba6cf5eef5459bb7139..696d4ff034c852ff9fdfd2f38b0682fc8b2dfe50 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,15 +31,17 @@ namespace mace { namespace kernels { -struct ConcatFunctorBase { - explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {} +struct ConcatFunctorBase : OpKernel { + ConcatFunctorBase(OpKernelContext *context, const int32_t axis) + : OpKernel(context), axis_(axis) {} int32_t axis_; }; template struct ConcatFunctor : ConcatFunctorBase { - explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} + ConcatFunctor(OpKernelContext *context, const int32_t axis) + : ConcatFunctorBase(context, axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, @@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct ConcatFunctor : ConcatFunctorBase { - explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} + ConcatFunctor(OpKernelContext *context, const int32_t axis) + : ConcatFunctorBase(context, axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 282472bca3829cd112a0be0d62a5409fdb3bbfc5..ce9bb11d24807261e556ad5b6e4c6347d1ba6eab 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -42,14 +42,16 @@ namespace mace { namespace kernels { -struct Conv2dFunctorBase { - Conv2dFunctorBase(const int *strides, +struct Conv2dFunctorBase : OpKernel { + Conv2dFunctorBase(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), dilations_(dilations), @@ -69,7 +71,8 @@ struct Conv2dFunctor; template<> struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -77,12 +80,14 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, activation, relux_max_limit), + transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT), is_filter_transformed_(is_filter_transformed), scratch_(scratch) {} @@ -721,7 +726,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { template<> struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -729,7 +735,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : 
Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -949,7 +956,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -957,7 +965,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -968,10 +977,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; diff --git a/mace/kernels/crop.h b/mace/kernels/crop.h index 241584e849906a49a979002871e431c82c6503ed..6ad9650ee406d13a8ca2b64b41fadd81ce462ca6 100644 --- a/mace/kernels/crop.h +++ b/mace/kernels/crop.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,10 +31,12 @@ namespace mace { namespace kernels { -struct CropFunctorBase { - CropFunctorBase(const int axis, +struct CropFunctorBase : OpKernel { + CropFunctorBase(OpKernelContext *context, + const int axis, const std::vector &offset) - : axis_(axis), + : OpKernel(context), + axis_(axis), offset_(offset) {} const int axis_; @@ -42,8 +45,10 @@ struct CropFunctorBase { template struct CropFunctor : CropFunctorBase { - CropFunctor(const int axis, const std::vector &offset) - : CropFunctorBase(axis, offset) {} + CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : CropFunctorBase(context, axis, offset) {} void crop_copy(const T* input_data, T* output_data, const std::vector &input_shape, @@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct CropFunctor : CropFunctorBase { - CropFunctor(const int axis, const std::vector &offset) - : CropFunctorBase(axis, offset) {} + CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : CropFunctorBase(context, axis, offset) {} MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); + Tensor *output, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 9450104d5abaf7f99a1d97c1d6fff11505562252..4bfc4d613a1454624e1a373f677ebd4df29c1db9 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input, } } // namespace deconv -struct Deconv2dFunctorBase { - Deconv2dFunctorBase(const std::vector &strides, +struct Deconv2dFunctorBase : OpKernel { + Deconv2dFunctorBase(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), output_shape_(output_shape), @@ -210,13 +212,15 @@ struct 
Deconv2dFunctorBase { template struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(const std::vector &strides, + Deconv2dFunctor(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : Deconv2dFunctorBase(strides, + : Deconv2dFunctorBase(context, + strides, padding_type, paddings, output_shape, @@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(const std::vector &strides, + Deconv2dFunctor(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : Deconv2dFunctorBase(strides, + : Deconv2dFunctorBase(context, + strides, padding_type, paddings, output_shape, diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index c0e0f2670fc2970d4d29cdb9ae4680e77a607a8d..7c4a7456a122b7028360ab560117ed7bce0e9a0a 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -19,6 +19,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -29,9 +30,11 @@ namespace mace { namespace kernels { template -struct DepthToSpaceOpFunctor { - explicit DepthToSpaceOpFunctor(const int block_size, bool d2s) - : block_size_(block_size), d2s_(d2s) {} +struct DepthToSpaceOpFunctor : OpKernel { + DepthToSpaceOpFunctor(OpKernelContext *context, + const int block_size, + bool d2s) + : OpKernel(context), block_size_(block_size), d2s_(d2s) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor { #ifdef MACE_ENABLE_OPENCL template -struct DepthToSpaceOpFunctor { - DepthToSpaceOpFunctor(const int block_size, bool d2s) - : block_size_(block_size), d2s_(d2s) {} +struct DepthToSpaceOpFunctor : OpKernel { + DepthToSpaceOpFunctor(OpKernelContext *context, + const int block_size, + bool d2s) + : OpKernel(context), block_size_(block_size), d2s_(d2s) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 9304b14f711f184616d42228cca0713b487f7511..3b2eb70bc78d26e586fba777945e4a674250df10 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -37,14 +37,16 @@ namespace mace { namespace kernels { -struct DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctorBase(const int *strides, +struct DepthwiseConv2dFunctorBase : OpKernel { + DepthwiseConv2dFunctorBase(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), dilations_(dilations), @@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor; template<> struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float 
relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor template<> struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor const int32_t *bias_data = nullptr; if (bias == nullptr) { zero_bias.reset( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); + new Tensor(GetCPUAllocator(), DT_INT32)); zero_bias->Resize(bias_shape); zero_bias->Clear(); bias_data = zero_bias->data(); @@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor template struct DepthwiseConv2dFunctor : DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 42d220fa2dd5a7c6bbd39052dd6a99960d24cda5..9e9a2be8f9e17cde08665e29c5debce4275e4eb2 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } -struct EltwiseFunctorBase { - EltwiseFunctorBase(const EltwiseType type, +struct EltwiseFunctorBase : OpKernel { + EltwiseFunctorBase(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, const DataFormat data_format) - : type_(type), + : OpKernel(context), + type_(type), coeff_(coeff), scalar_input_(scalar_input), scalar_input_index_(scalar_input_index), @@ -823,12 +826,14 @@ struct EltwiseFunctorBase { template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, + EltwiseFunctor(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, // float as it comes from arg const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(type, + : EltwiseFunctorBase(context, + type, coeff, scalar_input, scalar_input_index, @@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, + EltwiseFunctor(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(type, + : EltwiseFunctorBase(context, + type, coeff, scalar_input, scalar_input_index, diff --git a/mace/kernels/fill.h b/mace/kernels/fill.h index b534a1839c77d183441e9cff74c1de6a917fa648..131dd9d4bffc8f851dd22e1f1a1603defc3d5bb2 100644 --- a/mace/kernels/fill.h +++ 
b/mace/kernels/fill.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { @@ -30,8 +31,8 @@ template struct FillFunctor; template <> -struct FillFunctor { - FillFunctor() {} +struct FillFunctor : OpKernel { + explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *shape, const Tensor *value, diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index e5172920a2a3c08862948257debff0387da10a0c..e6743aa4475777310d9c93183bef8031fd1931c8 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -27,10 +27,12 @@ namespace mace { namespace kernels { -struct FullyConnectedBase { - FullyConnectedBase(const ActivationType activation, +struct FullyConnectedBase : OpKernel { + FullyConnectedBase(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : activation_(activation), + : OpKernel(context), + activation_(activation), relux_max_limit_(relux_max_limit) {} const ActivationType activation_; @@ -42,9 +44,10 @@ struct FullyConnectedFunctor; template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, @@ -86,9 +89,10 @@ struct FullyConnectedFunctor: FullyConnectedBase { template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, @@ -117,7 +121,7 @@ struct FullyConnectedFunctor: FullyConnectedBase { const int32_t *bias_ptr = nullptr; if (bias == nullptr) { zero_bias.reset( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); + new Tensor(GetCPUAllocator(), DT_INT32)); zero_bias->Resize(bias_shape); zero_bias->Clear(); bias_ptr = zero_bias->data(); @@ -148,9 +152,10 @@ struct FullyConnectedFunctor: FullyConnectedBase { #ifdef MACE_ENABLE_OPENCL template struct FullyConnectedFunctor : FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, diff --git a/mace/kernels/gather.h b/mace/kernels/gather.h index 101a60e3f9913c3ab14f7ba4e952390e832e7768..ddfa14d1d9b978e130162ee58d533810ae18cfec 100644 --- a/mace/kernels/gather.h +++ b/mace/kernels/gather.h @@ -21,13 +21,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { -struct GatherBase { - explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {} +struct GatherBase : OpKernel { + GatherBase(OpKernelContext *context, int axis, float y) + : OpKernel(context), axis_(axis), y_(y) {} 
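// ---------------------------------------------------------------------------
// Editor's note (not part of the diff): every functor touched in this change
// set now derives from the small OpKernel base introduced in
// mace/kernels/kernel.h (its full definition appears a few hunks below), so
// device state flows in through an explicit OpKernelContext instead of the
// OpenCLRuntime::Global()/GetDeviceAllocator() singletons. A hypothetical
// CPU functor following the pattern, as a minimal sketch:

#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"

namespace mace {
namespace kernels {

struct ScaleFunctor : OpKernel {  // illustrative kernel, not in the tree
  ScaleFunctor(OpKernelContext *context, float scale)
      : OpKernel(context), scale_(scale) {}

  MaceStatus operator()(const Tensor *input, Tensor *output) {
    // Device-specific resources come from the injected context when needed,
    // e.g. context_->device()->allocator(), or
    // context_->device()->opencl_runtime() in the OpenCL specializations.
    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const float *in = input->data<float>();
    float *out = output->mutable_data<float>();
    for (index_t i = 0; i < input->size(); ++i) {
      out[i] = in[i] * scale_;
    }
    return MACE_SUCCESS;
  }

  float scale_;
};

}  // namespace kernels
}  // namespace mace
// ---------------------------------------------------------------------------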
int axis_; float y_; @@ -38,7 +40,8 @@ struct GatherFunctor; template <> struct GatherFunctor : GatherBase { - explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {} + GatherFunctor(OpKernelContext *context, int axis, float y) + : GatherBase(context, axis, y) {} MaceStatus operator()(const Tensor *params, const Tensor *indices, diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc index c94c0af5e900e7414d452e35630b8c6f623418b7..5043a1041fea0fffeb0661651244b7b8c1899771 100644 --- a/mace/kernels/gemm.cc +++ b/mace/kernels/gemm.cc @@ -1341,8 +1341,8 @@ void Gemm(const float *A, ik_begin = bk * block_size_k + (bk < remain_k ? bk : remain_k); const index_t ik_end = std::min(K, ik_begin + this_block_size_k); - Tensor trans_a; - Tensor trans_b; + Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT); const float *real_a = nullptr; const float *real_b = nullptr; float *real_c = c_base + (ih_begin * width + iw_begin); @@ -1399,8 +1399,8 @@ void GemmRef(const float *A, const bool transpose_b) { memset(C, 0, sizeof(float) * batch * height * width); - Tensor trans_a; - Tensor trans_b; + Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT); float *trans_a_data = nullptr; float *trans_b_data = nullptr; if (transpose_a) { diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index 4e6b057f78520bbb05b18599482ff04a24e407c2..c4394fda15e95c2c65af625ed0e711af4391be6b 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -20,21 +20,24 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { -struct ImageToBufferFunctorBase { - explicit ImageToBufferFunctorBase(const int wino_blk_size) - : wino_blk_size_(wino_blk_size) {} +struct ImageToBufferFunctorBase : OpKernel { + ImageToBufferFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), + wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - explicit ImageToBufferFunctor(const int wino_blk_size) - : ImageToBufferFunctorBase(wino_blk_size) {} + ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size) + : ImageToBufferFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - explicit ImageToBufferFunctor(const int wino_blk_size) - : ImageToBufferFunctorBase(wino_blk_size) {} + ImageToBufferFunctor(OpKernelContext *context, + const int wino_blk_size) + : ImageToBufferFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/kernel.h b/mace/kernels/kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..853e974f76a5667c326c85346bfd3ba274b2cd9f --- /dev/null +++ b/mace/kernels/kernel.h @@ -0,0 +1,31 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_KERNEL_H_ +#define MACE_KERNELS_KERNEL_H_ + +#include "mace/core/op_kernel_context.h" + +namespace mace { +namespace kernels { + +struct OpKernel { + explicit OpKernel(OpKernelContext *context): context_(context) {} + + OpKernelContext *context_; +}; + +} // namespace kernels +} // namespace mace +#endif // MACE_KERNELS_KERNEL_H_ diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.h index 0af86327abc120b7a31348c9a3f393437466faf4..d9eeb7db43d8d8864f3f4a5a2d708fc600adfcab 100644 --- a/mace/kernels/local_response_norm.h +++ b/mace/kernels/local_response_norm.h @@ -21,7 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -34,7 +34,9 @@ template struct LocalResponseNormFunctor; template<> -struct LocalResponseNormFunctor { +struct LocalResponseNormFunctor : OpKernel { + explicit LocalResponseNormFunctor(OpKernelContext *context) + : OpKernel(context) {} MaceStatus operator()(const Tensor *input, int depth_radius, float bias, diff --git a/mace/kernels/lstmcell.h b/mace/kernels/lstmcell.h index 46439fae1f269abf21f53fe3ac75ba67df6406be..cb6b86fdd2959067b9d5c53bc69cdb325b286d2e 100644 --- a/mace/kernels/lstmcell.h +++ b/mace/kernels/lstmcell.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #if defined(MACE_ENABLE_NEON) #include @@ -35,9 +36,10 @@ template struct LSTMCellFunctor; template -struct LSTMCellFunctor { - explicit LSTMCellFunctor(T forget_bias) : - forget_bias_(static_cast(forget_bias)) {} +struct LSTMCellFunctor : OpKernel{ + LSTMCellFunctor(OpKernelContext *context, T forget_bias) + : OpKernel(context), + forget_bias_(static_cast(forget_bias)) {} MaceStatus operator()(const Tensor *input, const Tensor *pre_output, const Tensor *weight, diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 42e76002a231d3b0b5ebc38d3df0bacf0cc265a0..4b6c5cf1ef8309281178fd52f545556b87a80190 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -29,6 +29,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/gemm.h" +#include "mace/kernels/kernel.h" #include "mace/utils/utils.h" #include "mace/kernels/gemmlowp_util.h" @@ -40,7 +41,8 @@ namespace mace { namespace kernels { template -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *A, const Tensor *B, Tensor *C, @@ -87,7 +89,7 @@ struct MatMulFunctor { // A * B = (B^T * A^T)^T if (!transpose_b) { if (B_transpose_.get() == nullptr) { - B_transpose_.reset(new Tensor(GetDeviceAllocator(D), + B_transpose_.reset(new Tensor(context_->device()->allocator(), DataTypeToEnum::v())); B_transpose_->Resize({batch, width, K}); Tensor::MappingGuard guardbt(B_transpose_.get()); @@ -112,7 +114,8 @@ struct MatMulFunctor { }; template <> -struct MatMulFunctor { 
+struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} template void MatMulImpl(const Tensor *A, const Tensor *B, @@ -208,7 +211,8 @@ struct MatMulFunctor { #ifdef MACE_ENABLE_OPENCL template -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *A, const Tensor *B, Tensor *C, diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 2cd0c2a3868357946ccb73979fb0e4b4c1391a06..7757758c379b82ccfc8238da9960d46eed50380a 100644 --- a/mace/kernels/opencl/activation.cc +++ b/mace/kernels/opencl/activation.cc @@ -33,11 +33,11 @@ MaceStatus ActivationFunctor::operator()( const index_t channel_blocks = RoundUpDiv4(channels); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); @@ -94,12 +94,12 @@ MaceStatus ActivationFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, - lws, future)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index f01baa7170dbbf3c907ac38ab6d45bd600e50a31..7c1c1afc669b3de85055aac01ea9f96d9cf007ec 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -34,7 +34,7 @@ MaceStatus AddNFunctor::operator()( const index_t width = input_tensors[0]->dim(2); const index_t channels = input_tensors[0]->dim(3); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); for (size_t i = 1; i < size; ++i) { MACE_CHECK_NOTNULL(input_tensors[i]); @@ -49,7 +49,7 @@ MaceStatus AddNFunctor::operator()( MACE_NOT_IMPLEMENTED; } std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); @@ -96,7 +96,7 @@ MaceStatus AddNFunctor::operator()( std::string tuning_key = Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index e26065d9d340022455b585e55c953aab8c307e5c..446a26cc034bc9536d2495fc89c91d8174804f06 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor::operator()( static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == 
nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); @@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index aaa0d17203c40dbd177e5a42956b8d9d3078c9f2..eae22c0074c8205c69fc7741274b09700e94d6f1 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); @@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); cl::Event event; cl_int error; diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index c95ef0ade2789f880cb563ee2d0103c7de4abf6f..75d0c4f542a11feda4e615ff025d0d771931008b 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor::operator()( } } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index d74346832d9ff41af251decbdd0e113a408c9f62..64de09c2d597b5fe5bd2bf4923c442426c43ce8c 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor::operator()( static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); @@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor::operator()( input_shape_ = 
input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 58b27faa92ff1a3979249cbe9a552ff4f58323c6..6fa4ba8fedf2abc4365a2b83b16c0d90a51aeae0 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -22,13 +22,15 @@ namespace mace { namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = std::min(base, kwg_size / lws[1]); @@ -41,7 +43,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { } // namespace -static MaceStatus Concat2(cl::Kernel *kernel, +static MaceStatus Concat2(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, @@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel, static_cast(batch * height), }; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); @@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel, *prev_input_shape = input0->shape(); } - const std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } -static MaceStatus ConcatN(cl::Kernel *kernel, +static MaceStatus ConcatN(OpKernelContext *context, + cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, @@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel, const index_t height = output->dim(1); const index_t width = output->dim(2); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); @@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, static_cast(input_channel_blk), static_cast(width), static_cast(batch * height), }; - const 
std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); uint32_t idx = 0; OUT_OF_RANGE_SET_ARG_PTR; @@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel, for (size_t j = 0; j < 3; ++j) { roundup_gws[j] = RoundUp(gws[j], lws[j]); } - const std::vector lws = LocalWS(gws, *kwg_size); - error = runtime->command_queue().enqueueNDRangeKernel( *kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), @@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, } } if (future != nullptr) { - future->wait_fn = [runtime, call_stats](CallStats *stats) { + future->wait_fn = [call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -234,12 +236,14 @@ MaceStatus ConcatFunctor::operator()( switch (inputs_count) { case 2: - return Concat2(&kernel_, input_list[0], input_list[1], + return Concat2(context_, + &kernel_, input_list[0], input_list[1], DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_, &kernel_error_); default: if (divisible_four) { - return ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, + return ConcatN(context_, + &kernel_, input_list, DataTypeToEnum::value, output, future, &kwg_size_, &kernel_error_); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc index 6221382e7e8e9d9290777379d7d77832b17b8e40..bc8538b77e9f9de56a6e51cdbdbcd905ff8f2a50 100644 --- a/mace/kernels/opencl/conv_2d.cc +++ b/mace/kernels/opencl/conv_2d.cc @@ -18,7 +18,8 @@ namespace mace { namespace kernels { -extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, uint32_t *kwg_size, std::unique_ptr *kernel_error); -extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, uint32_t *kwg_size, std::unique_ptr *kernel_error); -extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, +extern MaceStatus Conv2dOpencl(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, Tensor *output, StatsFuture *future) { typedef MaceStatus (*Conv2dOpenclFunction)( - cl::Kernel * kernel, const Tensor *input, const Tensor *filter, - const Tensor *bias, const int stride, const int *padding, - const int *dilations, const ActivationType activation, + OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input, + const Tensor *filter, const Tensor *bias, const int stride, + const int *padding, const int *dilations, + const ActivationType activation, const float relux_max_limit, const DataType dt, std::vector *input_shape, Tensor *output, StatsFuture *future, uint32_t *kwg_size, std::unique_ptr *kernel_error); @@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, if (kernel_h == kernel_w && kernel_h <= 3 && selector[kernel_h - 1] != nullptr) { auto conv2d_func = selector[kernel_h - 1]; - return conv2d_func( + return conv2d_func(context_, &kernel_, input, filter, bias, strides_[0], 
diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc
index 6221382e7e8e9d9290777379d7d77832b17b8e40..bc8538b77e9f9de56a6e51cdbdbcd905ff8f2a50 100644
--- a/mace/kernels/opencl/conv_2d.cc
+++ b/mace/kernels/opencl/conv_2d.cc
@@ -18,7 +18,8 @@
 namespace mace {
 namespace kernels {

-extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
                                    uint32_t *kwg_size,
                                    std::unique_ptr<Buffer> *kernel_error);

-extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
                                    uint32_t *kwg_size,
                                    std::unique_ptr<Buffer> *kernel_error);

-extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(OpKernelContext *runtime,
+                               cl::Kernel *kernel,
                                const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
@@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                                                          Tensor *output,
                                                          StatsFuture *future) {
   typedef MaceStatus (*Conv2dOpenclFunction)(
-      cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
-      const Tensor *bias, const int stride, const int *padding,
-      const int *dilations, const ActivationType activation,
+      OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input,
+      const Tensor *filter, const Tensor *bias, const int stride,
+      const int *padding, const int *dilations,
+      const ActivationType activation,
       const float relux_max_limit, const DataType dt,
       std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
       uint32_t *kwg_size, std::unique_ptr<Buffer> *kernel_error);
@@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   if (kernel_h == kernel_w && kernel_h <= 3 &&
       selector[kernel_h - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1];
-    return conv2d_func(
+    return conv2d_func(context_,
         &kernel_, input, filter, bias, strides_[0], paddings.data(),
         dilations_, activation_, relux_max_limit_, DataTypeToEnum<T>::value,
         &input_shape_, output, future, &kwg_size_, &kernel_error_);
   } else {
-    return Conv2dOpencl(
+    return Conv2dOpencl(context_,
         &kernel_, input, filter, bias, strides_[0], paddings.data(),
         dilations_, activation_, relux_max_limit_, DataTypeToEnum<T>::value,
         &input_shape_, output, future, &kwg_size_, &kernel_error_);
diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc
index 770f0606d4152c6ad7e65f92c94246487571417a..c43c045019ba1a2e9cde11cd9288159f36ed45ec 100644
--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
@@ -25,14 +25,16 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 128;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t compute_units = runtime->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
@@ -62,7 +64,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     MACE_CHECK(input_batch == batch);

     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
     built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
@@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));
   OUT_OF_RANGE_VALIDATION(*kernel_error);
   return MACE_SUCCESS;
diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc
index 02df4ea166abd8f80bd77e9cc6c91754a30503e8..c0362831658ccd327ec0407bdc9f4ff05d40cf1c 100644
--- a/mace/kernels/opencl/conv_2d_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_3x3.cc
@@ -24,15 +24,17 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t compute_units = std::max<uint32_t>(
-        OpenCLRuntime::Global()->device_compute_units() / 2, 1);
+        runtime->device_compute_units() / 2, 1);
     const uint32_t base = std::max<uint32_t>(
         std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
@@ -55,7 +57,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context,
+                                   cl::Kernel *kernel,
                                    const Tensor *input,
                                    const Tensor *filter,
                                    const Tensor *bias,
@@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
     built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
@@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));
   OUT_OF_RANGE_VALIDATION(*kernel_error);
   return MACE_SUCCESS;
diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc
index fa2c9774b607652c3ca307239d5baae33aeac699..bac1da8f40e0c8ad2a75d328730ee9f0f495319b 100644
--- a/mace/kernels/opencl/conv_2d_general.cc
+++ b/mace/kernels/opencl/conv_2d_general.cc
@@ -26,7 +26,8 @@ namespace {
 const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
 // TODO(liuqi): Fix the specific value.
 const uint32_t lws_limit = 20;
-std::vector<uint32_t> LocalWS(const uint32_t *gws,
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
                               const uint32_t kernel_size,
                               const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
@@ -34,8 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-    uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
+        cache_size = runtime->device_global_mem_cache_size();
+    uint32_t compute_units = runtime->device_compute_units();
     const uint32_t base =
         std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
@@ -64,7 +65,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,

 }  // namespace

-extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
+extern MaceStatus Conv2dOpencl(OpKernelContext *context,
+                               cl::Kernel *kernel,
                                const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
@@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
     built_options.emplace("-Dconv_2d=" + kernel_name);
@@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel,
       Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3), filter->dim(2), filter->dim(3));
   std::vector<uint32_t> lws =
-      LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+      LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(*kernel_error);
diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc
index 651b2ef87a544ca6f682aedb3e8a2c1ae3bd4bf1..fce91d2be483d62570fec85bd91515f8bb89e8d5 100644
--- a/mace/kernels/opencl/crop.cc
+++ b/mace/kernels/opencl/crop.cc
@@ -22,13 +22,15 @@ namespace mace {
 namespace kernels {
 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     lws[0] = std::min(base, kwg_size / lws[1]);
@@ -132,11 +134,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
       static_cast<uint32_t>(output->dim(0) * output->dim(1))
   };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop");
     built_options.emplace("-Dcrop=" + kernel_name);
@@ -167,11 +169,11 @@ MaceStatus CropFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input0->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("crop_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
   return MACE_SUCCESS;
diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc
index cba8cbceaeb5fafdd7410250b0b36c6238706479..197b305e7e80b10d121c883d417c59a71d2abd9e 100644
--- a/mace/kernels/opencl/deconv_2d.cc
+++ b/mace/kernels/opencl/deconv_2d.cc
@@ -20,7 +20,8 @@ namespace kernels {

 namespace {

-MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
+MaceStatus Deconv2dOpencl(OpKernelContext *context,
+                          cl::Kernel *kernel,
                           const Tensor *input,
                           const Tensor *filter,
                           const Tensor *bias,
@@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
   const int align_w = stride_w - 1 - padding_w;
   const int kernel_size = filter->dim(2) * filter->dim(3);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d");
     built_options.emplace("-Ddeconv_2d=" + kernel_name);
@@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, *kwg_size);
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
                  &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

-  return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(),
-                        paddings.data(), activation_, relux_max_limit_,
-                        DataTypeToEnum<T>::value, &input_shape_, output, future,
+  return Deconv2dOpencl(context_, &kernel_, input, filter, bias,
+                        strides_.data(), paddings.data(), activation_,
+                        relux_max_limit_, DataTypeToEnum<T>::value,
+                        &input_shape_, output, future,
                         &kwg_size_, &kernel_error_);
 }
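For orientation on the align_h/align_w bookkeeping in Deconv2dOpencl above (align = stride - 1 - padding): the kernel relies on the standard transposed-convolution geometry. A self-contained check of that relation, not lifted from the MACE sources:

    // Standard transposed-convolution output size:
    //   out = (in - 1) * stride - 2 * padding + kernel
    #include <cassert>

    int DeconvOutSize(int in, int kernel, int stride, int padding) {
      return (in - 1) * stride - 2 * padding + kernel;
    }

    int main() {
      assert(DeconvOutSize(7, 3, 2, 1) == 13);   // 7x7 input -> 13x13 output
      assert(DeconvOutSize(16, 4, 2, 1) == 32);  // common 2x upsampling shape
      return 0;
    }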
diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc
index 4c1fd3becb1ada46dee96afe50ff56ff728ba0e9..f5427af18d5b37887bb2991f0f51b2731c6e7eff 100644
--- a/mace/kernels/opencl/depth_to_space.cc
+++ b/mace/kernels/opencl/depth_to_space.cc
@@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape));

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::stringstream kernel_name_ss;
@@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc
index 3c97a28845ae09152e6439092db8b78e4f992275..1bc910fdabc5551ff48e431f193ba42346830759 100644
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -24,13 +24,15 @@ namespace kernels {
 namespace {
 // (inputs + weights + outputs) * array_size * sizeof(float)
 const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = cache_size / kBaseGPUMemCacheSize;
     lws[1] = std::min(gws[1], kwg_size);
     if (lws[1] >= base) {
@@ -58,7 +60,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {

 }  // namespace

-static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
+static MaceStatus DepthwiseConv2d(OpKernelContext *context,
+                                  cl::Kernel *kernel,
                                   const Tensor *input,   // NHWC
                                   const Tensor *filter,  // HWIM
                                   const Tensor *bias,
@@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(*kernel_error);
+    OUT_OF_RANGE_CONFIG(*kernel_error, context);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
     if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, *kwg_size);
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0],
                                   gws[1], gws[2], multiplier);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(*kernel_error);
@@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
   index_t kernel_h = filter->dim(2);
   index_t kernel_w = filter->dim(3);
   if (strides_[0] != strides_[1]) {
-    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
-                 << "filter" << kernel_h << "x" << kernel_w << ","
-                 << " stride " << strides_[0] << "x" << strides_[1]
-                 << " is not implemented yet, using slow version";
-    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
-    return DepthwiseConv2dFunctor<DeviceType::CPU, float>(
-        strides_, padding_type_, paddings_, dilations_, activation_,
-        relux_max_limit_)(input, filter, bias, output, future);
+    LOG(FATAL) << "GPU depthwise conv2d kernel with "
+               << "filter" << kernel_h << "x" << kernel_w << ","
+               << " stride " << strides_[0] << "x" << strides_[1]
+               << " is not implemented yet.";
   }

   // Create a fake conv_2d filter to calculate the paddings and output size
@@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));

   return DepthwiseConv2d(
+      context_,
       &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
       activation_, relux_max_limit_, DataTypeToEnum<T>::value, &input_shape_,
       output, future, &kwg_size_, &kernel_error_);
diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc
index 9eedf011009ee8acbb97a00a84da5e140f11fa8c..201639e31bad24abc3c61053596b89d5fc7a25d7 100644
--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(batch_height_pixels)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
     input_shape_ = input0->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));
   OUT_OF_RANGE_VALIDATION(kernel_error_);
   return MACE_SUCCESS;
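The recurring OUT_OF_RANGE_CONFIG change in these kernels swaps a global GPU-allocator lookup for the allocator of the device carried by the context (see the macro in helper.h further down). A toy model of that injection pattern, with stand-in Allocator/ErrorFlag types rather than the real MACE classes:

    // The one-byte out-of-range flag is allocated from whatever allocator the
    // caller injects; no global device registry is consulted.
    #include <cstddef>
    #include <cstdint>

    struct Allocator {
      virtual ~Allocator() = default;
      virtual void *New(size_t bytes) = 0;
      virtual void Delete(void *p) = 0;
    };

    struct HostAllocator : Allocator {
      void *New(size_t bytes) override { return new uint8_t[bytes](); }
      void Delete(void *p) override { delete[] static_cast<uint8_t *>(p); }
    };

    class ErrorFlag {
     public:
      explicit ErrorFlag(Allocator *a)
          : alloc_(a), byte_(static_cast<uint8_t *>(a->New(1))) {}
      ~ErrorFlag() { alloc_->Delete(byte_); }
      bool raised() const { return *byte_ != 0; }

     private:
      Allocator *alloc_;
      uint8_t *byte_;
    };

    int main() {
      HostAllocator host;
      ErrorFlag flag(&host);  // allocator injected, as via context->device()
      return flag.raised() ? 1 : 0;
    }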
MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, std::string tuning_key = Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws->data(), *lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); @@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor::operator()( &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - return FCWXKernel(&kernel_, input, weight, bias, &input_shape_, output, + return FCWXKernel(context_, + &kernel_, input, weight, bias, &input_shape_, output, activation_, &gws_, &lws_, relux_max_limit_, future, &kernel_error_); } diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 6ef80c80d1e21b9e8a3c0e93b1721d50ccc46d00..aa3daadbd69f28975ec4ae75aba34ca78f595a69 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) { } } -std::vector Default3DLocalWS(const uint32_t *gws, +std::vector Default3DLocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { - uint64_t cache_size = - OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint64_t cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[2] = @@ -245,13 +245,12 @@ std::vector Default3DLocalWS(const uint32_t *gws, return lws; } -MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); - auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, } return error; }; - OpenCLProfilingTimer timer(&event); - cl_int err = Tuner::Get()->template TuneOrRun( + OpenCLProfilingTimer timer(runtime, &event); + cl_int err = runtime->tuner()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); MACE_CL_RET_STATUS(err); if (future != nullptr) { - future->wait_fn = [event](CallStats *stats) { + future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { - OpenCLRuntime::Global()->GetCallStats(event, stats); + runtime->GetCallStats(event, stats); } }; } return MaceStatus::MACE_SUCCESS; } -MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); - auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, } return error; }; - OpenCLProfilingTimer timer(&event); - cl_int err = Tuner::Get()->template TuneOrRun( + OpenCLProfilingTimer timer(runtime, &event); + cl_int err = runtime->tuner()->template TuneOrRun( tuning_key, lws, params_generator, func, 
&timer); MACE_CL_RET_STATUS(err); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5d4bf4104172ac093212fcb023941e9bb0015b6c..d9e309bc2c19045ffcd9eb4f373fb9dc7b208f61 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -31,11 +31,11 @@ namespace mace { namespace kernels { -#define OUT_OF_RANGE_CONFIG(kernel_error) \ +#define OUT_OF_RANGE_CONFIG(kernel_error, context) \ if (runtime->IsOutOfRangeCheckEnabled()) { \ built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ (kernel_error) = std::move(std::unique_ptr( \ - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ + new Buffer((context)->device()->allocator()))); \ MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ (kernel_error)->Map(nullptr); \ *((kernel_error)->mutable_data()) = 0; \ @@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpCompatibleCLDt(const DataType dt); // Tuning or Run OpenCL kernel with 3D work group size -MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); // Tuning or Run OpenCL kernel with 2D work group size -MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, @@ -162,7 +164,8 @@ std::string Concat(Args... args) { return ss.str(); } -std::vector Default3DLocalWS(const uint32_t *gws, +std::vector Default3DLocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kwg_size); } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 955b9ebebd3fcb1d3bc48f04de7617e5b10e43cb..b98e9fb2ac77ee785ac21f17360fa998b37f537f 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor::operator()( break; } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc index ffc185d0dc84b2019c473827e8d02edc141e1482..6704c0b457d876c28a590860e6ad866ff24228ad 100644 --- a/mace/kernels/opencl/lstmcell.cc +++ b/mace/kernels/opencl/lstmcell.cc @@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor::operator()( const index_t width = input->dim(1); const index_t width_blocks = width / 4; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); @@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor::operator()( const std::vector lws = {kwg_size_ / 16, 16, 0}; std::string tuning_key = Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, 
diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc
index ffc185d0dc84b2019c473827e8d02edc141e1482..6704c0b457d876c28a590860e6ad866ff24228ad 100644
--- a/mace/kernels/opencl/lstmcell.cc
+++ b/mace/kernels/opencl/lstmcell.cc
@@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
   const index_t width = input->dim(1);
   const index_t width_blocks = width / 4;

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell");
@@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor<DeviceType::GPU, T>::operator()(
   const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::string tuning_key =
       Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 4df9d58d74098bd6aae2e9697333090f05e215ba..407b455d106ad765f4731f98caa948527d1a2129 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -53,11 +53,11 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
       static_cast<uint32_t>(height_blocks * batch),
   };

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
@@ -84,7 +84,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width);
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc
index d257fea2d7fca9333c8d997e7703f53345feba2a..03f05ca5f5711edb327fd9a14e5de79a21eb074c 100644
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -16,6 +16,8 @@
 #include <vector>

 #include "gtest/gtest.h"
+#include "mace/core/op_kernel_context.h"
+#include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
@@ -25,14 +27,15 @@ namespace mace {
 namespace kernels {
 namespace {

-bool BufferToImageOpImpl(Tensor *buffer,
+bool BufferToImageOpImpl(OpKernelContext *context,
+                         Tensor *buffer,
                          Tensor *image,
                          const std::vector<size_t> &image_shape) {
   std::unique_ptr<Buffer> kernel_error;
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context->device()->opencl_runtime();

   std::string kernel_name = "in_out_buffer_to_image";
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
@@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
-  OUT_OF_RANGE_CONFIG(kernel_error);
+  OUT_OF_RANGE_CONFIG(kernel_error, context);
   NON_UNIFORM_WG_CONFIG;
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" +
@@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) {
   index_t width = 7;
   index_t channels = 11;

-  std::vector<index_t> buffer_shape = {batch, height, width, channels};
+  GPUContext gpu_context;
+  std::unique_ptr<Device> device(new GPUDevice(gpu_context.opencl_tuner()));
+
   Workspace ws;
+  OpKernelContext context(&ws, device.get());
+
+  std::vector<index_t> buffer_shape = {batch, height, width, channels};
   Tensor *buffer =
-      ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU),
+      ws.CreateTensor("Buffer", device->allocator(),
                       DataTypeToEnum<float>::v());
   buffer->Resize(buffer_shape);

   std::vector<size_t> image_shape;
-  Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU),
+  Tensor *image = ws.CreateTensor("Image", device->allocator(),
                                   DataTypeToEnum<float>::v());
   CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape);
   image->ResizeImage(buffer->shape(), image_shape);

-  ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
+  ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape));

   std::vector<size_t> overflow_image_shape = image_shape;
   for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
     overflow_image_shape[i] += 1;
   }
-  ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
+  ASSERT_TRUE(BufferToImageOpImpl(&context,
+                                  buffer,
+                                  image,
+                                  overflow_image_shape));
 }

 }  // namespace kernels
diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc
index 04e9d69d4aaf8f7a81f2deee644e80cdc4988145..a3f4cfaa53c7b21f95cfcbea219c4fe3853d6a72 100644
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -47,11 +47,11 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   const index_t channel_blocks = RoundUpDiv4(channels);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad");
     built_options.emplace("-Dpad=" + kernel_name);
@@ -85,10 +85,10 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("pad", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
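The reworked test above shows the new setup sequence end to end: build a GPU context, wrap it in a Device, and hand an OpKernelContext to the kernel. The shape of that dependency injection, reduced to a compilable toy with stand-in types (none of these are the real MACE classes):

    #include <iostream>
    #include <string>

    struct Device { std::string name; };

    struct OpKernelContext {
      explicit OpKernelContext(Device *d) : device_(d) {}
      Device *device() const { return device_; }
     private:
      Device *device_;
    };

    struct OpKernel {
      explicit OpKernel(OpKernelContext *context) : context_(context) {}
     protected:
      OpKernelContext *context_;  // every functor now carries its context
    };

    struct PadKernel : OpKernel {
      using OpKernel::OpKernel;
      void Run() { std::cout << "pad on " << context_->device()->name << "\n"; }
    };

    int main() {
      Device gpu{"adreno"};
      OpKernelContext ctx(&gpu);
      PadKernel(&ctx).Run();  // each engine owns its device; no globals
      return 0;
    }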
diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc
index 18eb6e80f9595ac177db70d49f9ac81b822bcbfc..c6743750a3b381e5fcbb632980df50340fa872d2 100644
--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
@@ -23,13 +23,15 @@ namespace kernels {

 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     lws[2] =
@@ -54,12 +56,12 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
   MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
       << "Pooling opencl kernel not support dilation yet";

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
     built_options.emplace("-Dpooling=" + kernel_name);
@@ -149,11 +151,11 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
     };
   }

-  const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws.data(), kwg_size_);
   std::string tuning_key =
       Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws.data(), lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/reduce_mean.cc b/mace/kernels/opencl/reduce_mean.cc
index 075632c554323d591ab614c45d55717b9bcc44ad..a6a45f764a6e78628a98410ba20a423c58e0c6fd 100644
--- a/mace/kernels/opencl/reduce_mean.cc
+++ b/mace/kernels/opencl/reduce_mean.cc
@@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
   const index_t channel_blocks = RoundUpDiv4(channels);
   const uint32_t image_size = static_cast<uint32_t>(in_height * in_width);

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();
   std::vector<uint32_t> gws(3);
   std::vector<uint32_t> lws(3);
   std::vector<index_t> output_shape{batch, 1, 1, channels};
@@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor<DeviceType::GPU, T>::operator()(
   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean");
     built_options.emplace("-Dreduce_mean=" + kernel_name);
diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc
index f8a33383e99a2db4978f40a9c17ce9034df30218..6fc26e52d8d7d1dc2a2e0e1229541fd2800dd358 100644
--- a/mace/kernels/opencl/resize_bicubic.cc
+++ b/mace/kernels/opencl/resize_bicubic.cc
@@ -23,9 +23,11 @@ namespace mace {
 namespace kernels {
 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
-  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+  uint64_t cache_size = runtime->device_global_mem_cache_size();
   uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min(gws[1], kwg_size);
   if (lws[1] >= base) {
@@ -65,15 +67,15 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
+    auto dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache");
     built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt));
     built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize));
@@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc
index 0b297dd22dae97f3be1fdf881a42118acf03169c..23e5db1c102979c9f3dbea869016d56ccc359d62 100644
--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
@@ -23,13 +23,15 @@ namespace mace {
 namespace kernels {
 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     if (lws[1] >= base) {
@@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
     built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
@@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }

-  const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc
index f401b827096189156c184f348f0017ede7dce13f..e84ec7312d6d0e2e5cce33b1253b0ff948af19d5 100644
--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
@@ -24,13 +24,15 @@ namespace kernels {

 namespace {

-std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
+std::vector<uint32_t> LocalWS(OpenCLRuntime *runtime,
+                              const uint32_t *gws,
+                              const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   if (kwg_size == 0) {
     lws[0] = lws[1] = lws[2] = 1;
   } else {
     uint64_t
-        cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
+        cache_size = runtime->device_global_mem_cache_size();
     uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
     lws[1] = std::min(gws[1], kwg_size);
     if (gws[0] < base) {
@@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
     built_options.emplace("-Dsoftmax=" + kernel_name);
@@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
     input_shape_ = logits->shape();
   }

-  std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
+  std::vector<uint32_t> lws = LocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat("softmax_opencl_kernel", batch, height, width, channels);
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc
index c31b2d691f5fccd72faa75f35ce88f034bd7900f..8794dd2a5ee2cefffaa8cec5b591501b3980c2a8 100644
--- a/mace/kernels/opencl/space_to_batch.cc
+++ b/mace/kernels/opencl/space_to_batch.cc
@@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
       chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::stringstream kernel_name_ss;
     kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
     space_shape_ = space_tensor->shape();
   }

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   std::string tuning_key =
       Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
              batch_tensor->dim(2), batch_tensor->dim(3));
-  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/opencl/split.cc b/mace/kernels/opencl/split.cc
index 65fd6be530898200e50cc74518813cd01e7c9d15..c445b783564095e5ef27ecfc486fbe79b0cc1548 100644
--- a/mace/kernels/opencl/split.cc
+++ b/mace/kernels/opencl/split.cc
@@ -40,11 +40,11 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
         output_list[i]->ResizeImage(output_shape, image_shape));
   }

-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split");
     built_options.emplace("-Dsplit=" + kernel_name);
@@ -66,7 +66,7 @@ MaceStatus SplitFunctor<DeviceType::GPU, T>::operator()(
       static_cast<uint32_t>(input->dim(0) * input->dim(1)),
   };

-  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
+  const std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_);
   cl::Event event;
   CallStats call_stats{INT64_MAX, 0};
   for (size_t i = 0; i < outputs_count; ++i) {
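The channel-block and round-up arithmetic these kernels share (RoundUpDiv4 for the 4-channels-per-image-pixel packing, RoundUp of gws to a multiple of lws before enqueue) is simple enough to verify standalone; the formulas below follow the usual convention and match how the helpers are used above:

    #include <cassert>
    #include <cstdint>

    inline int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }

    inline uint32_t RoundUp(uint32_t v, uint32_t factor) {
      return factor == 0 ? v : (v + factor - 1) / factor * factor;
    }

    int main() {
      assert(RoundUpDiv4(11) == 3);     // 11 channels -> 3 channel blocks
      assert(RoundUp(100, 16) == 112);  // gws rounded to an lws multiple
      return 0;
    }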
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 74d8776fa089168c87ea7b1751244d3151e28492..43210171a743bdd7dd5640ccaf2415c23bacd553 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -24,12 +24,12 @@ namespace kernels {
 template <typename T>
 MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     if (wino_blk_size_ == 4) {
       obfuscated_kernel_name =
@@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
              output_tensor->dim(0), output_tensor->dim(1),
              output_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
@@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
     const std::vector<const Tensor *> &inputs,
     Tensor *output_tensor,
     StatsFuture *future) {
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = context_->device()->opencl_runtime();

   const Tensor *input_tensor = inputs[0];
   const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr;
@@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name;
     std::set<std::string> built_options;
-    OUT_OF_RANGE_CONFIG(kernel_error_);
+    OUT_OF_RANGE_CONFIG(kernel_error_, context_);
     NON_UNIFORM_WG_CONFIG;
     if (wino_blk_size_ == 4) {
       obfuscated_kernel_name =
@@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
       Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
              output_tensor->dim(1), output_tensor->dim(2),
              output_tensor->dim(3), input_tensor->dim(2));
-  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key,
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key,
                                            gws, lws, future));

   OUT_OF_RANGE_VALIDATION(kernel_error_);
diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h
index de851bb7093781bea137b89c04a991323809af29..14a4c8d6f4b7438709f1af05d776bec7cb273883 100644
--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -29,10 +30,13 @@ namespace mace {
 namespace kernels {

-struct PadFunctorBase {
-  PadFunctorBase(const std::vector<int> &paddings,
+struct PadFunctorBase : OpKernel {
+  PadFunctorBase(OpKernelContext *context,
+                 const std::vector<int> &paddings,
                  const float constant_value)
-      : paddings_(paddings), constant_value_(constant_value) {}
+      : OpKernel(context),
+        paddings_(paddings),
+        constant_value_(constant_value) {}

   std::vector<int> paddings_;
   float constant_value_;
@@ -40,9 +44,10 @@ struct PadFunctorBase {

 template <DeviceType D, typename T>
 struct PadFunctor : public PadFunctorBase {
-  PadFunctor(const std::vector<int> &paddings,
+  PadFunctor(OpKernelContext *context,
+             const std::vector<int> &paddings,
              const float constant_value)
-      : PadFunctorBase(paddings, constant_value) {}
+      : PadFunctorBase(context, paddings, constant_value) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct PadFunctor<DeviceType::GPU, T> : PadFunctorBase {
-  PadFunctor(const std::vector<int> &paddings,
+  PadFunctor(OpKernelContext *context,
+             const std::vector<int> &paddings,
              const float constant_value)
-      : PadFunctorBase(paddings, constant_value) {}
+      : PadFunctorBase(context, paddings, constant_value) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 94a388bec7227d3d39f0caa60fddb30a13c059ae..c61745284b2288278be0d9c95076a9ab76af45cb 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -23,6 +23,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/kernels/kernel.h"

 #if defined(MACE_ENABLE_NEON)
 #include <arm_neon.h>
@@ -41,14 +42,16 @@ enum PoolingType {

 namespace kernels {

-struct PoolingFunctorBase {
-  PoolingFunctorBase(const PoolingType pooling_type,
+struct PoolingFunctorBase : OpKernel {
+  PoolingFunctorBase(OpKernelContext *context,
+                     const PoolingType pooling_type,
                      const int *kernels,
                      const int *strides,
                      const Padding padding_type,
                      const std::vector<int> &paddings,
                      const int *dilations)
-      : pooling_type_(pooling_type),
+      : OpKernel(context),
+        pooling_type_(pooling_type),
         kernels_(kernels),
         strides_(strides),
         padding_type_(padding_type),
@@ -68,14 +71,20 @@ struct PoolingFunctor;

 template <>
 struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {
   }

   void MaxPooling(const float *input,
@@ -231,15 +240,20 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {

 template <>
 struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
-  }
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {}

   void MaxPooling(const uint8_t *input,
                   const index_t *in_shape,
@@ -443,14 +457,20 @@ struct PoolingFunctor<DeviceType::CPU, uint8_t>: PoolingFunctorBase {
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct PoolingFunctor<DeviceType::GPU, T> : PoolingFunctorBase {
-  PoolingFunctor(const PoolingType pooling_type,
+  PoolingFunctor(OpKernelContext *context,
+                 const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(
-            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+      : PoolingFunctorBase(context,
+                           pooling_type,
+                           kernels,
+                           strides,
+                           padding_type,
+                           paddings,
+                           dilations) {
   }

   MaceStatus operator()(const Tensor *input_tensor,
                         Tensor *output_tensor,
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index 89f79b7fa702a05de8b7d781c380420915a2ca20..aa002988a53f3145f945b145432da2d21ae34f01 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/public/mace.h"

 namespace mace {
@@ -121,8 +122,9 @@ inline std::vector<int> nms(const float *bboxes_ptr,

 template <DeviceType D, typename T>
-struct ProposalFunctor {
-  ProposalFunctor(const int min_size,
+struct ProposalFunctor : OpKernel {
+  ProposalFunctor(OpKernelContext *context,
+                  const int min_size,
                   const float nms_thresh,
                   const int pre_nms_top_n,
                   const int post_nms_top_n,
@@ -130,6 +132,7 @@ struct ProposalFunctor {
                   const int base_size,
                   const std::vector<float> &scales,
                   const std::vector<float> &ratios) :
+      OpKernel(context),
       min_size_(min_size),
       thresh_(nms_thresh),
       pre_nms_top_n_(pre_nms_top_n),
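For readers unfamiliar with the nms() helper that ProposalFunctor configures (nms_thresh, pre/post_nms_top_n): greedy non-maximum suppression over score-sorted boxes is the textbook form below. This is a generic sketch for orientation only; the in-tree nms() differs in data layout and tie-breaking:

    #include <algorithm>
    #include <vector>

    struct Box { float x1, y1, x2, y2; };

    inline float IoU(const Box &a, const Box &b) {
      float ix = std::max(0.f, std::min(a.x2, b.x2) - std::max(a.x1, b.x1));
      float iy = std::max(0.f, std::min(a.y2, b.y2) - std::max(a.y1, b.y1));
      float inter = ix * iy;
      float uni = (a.x2 - a.x1) * (a.y2 - a.y1) +
                  (b.x2 - b.x1) * (b.y2 - b.y1) - inter;
      return uni > 0.f ? inter / uni : 0.f;
    }

    // 'boxes' must already be sorted by descending score.
    std::vector<int> GreedyNMS(const std::vector<Box> &boxes, float thresh) {
      std::vector<int> keep;
      std::vector<bool> removed(boxes.size(), false);
      for (size_t i = 0; i < boxes.size(); ++i) {
        if (removed[i]) continue;
        keep.push_back(static_cast<int>(i));
        for (size_t j = i + 1; j < boxes.size(); ++j) {
          if (!removed[j] && IoU(boxes[i], boxes[j]) > thresh) removed[j] = true;
        }
      }
      return keep;
    }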
"mace/core/tensor.h" +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { @@ -173,8 +174,8 @@ template struct QuantizeFunctor; template<> -struct QuantizeFunctor { - QuantizeFunctor() {} +struct QuantizeFunctor : OpKernel { + explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const bool non_zero, @@ -212,8 +213,8 @@ template struct DequantizeFunctor; template<> -struct DequantizeFunctor { - DequantizeFunctor() {} +struct DequantizeFunctor : OpKernel { + explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/reduce_mean.h b/mace/kernels/reduce_mean.h index 65dc67d91c07e3b6f663f67f1135417f2accbc59..71fc2de028e207249cc95c48d82699f77d6c353f 100644 --- a/mace/kernels/reduce_mean.h +++ b/mace/kernels/reduce_mean.h @@ -24,6 +24,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" #endif @@ -31,10 +32,12 @@ namespace mace { namespace kernels { -struct ReduceFunctorBase { - ReduceFunctorBase(const std::vector &axis, +struct ReduceFunctorBase : OpKernel { + ReduceFunctorBase(OpKernelContext *context, + const std::vector &axis, const bool keep_dims) - : keep_dims_(keep_dims), + : OpKernel(context), + keep_dims_(keep_dims), axis_(axis) {} bool keep_dims_; bool reduce_first_axis_; @@ -44,10 +47,11 @@ struct ReduceFunctorBase { }; template -struct ReduceMeanFunctor : ReduceFunctorBase{ - ReduceMeanFunctor(const std::vector &axis, +struct ReduceMeanFunctor : ReduceFunctorBase { + ReduceMeanFunctor(OpKernelContext *context, + const std::vector &axis, const bool keep_dims) - : ReduceFunctorBase(axis, keep_dims) {} + : ReduceFunctorBase(context, axis, keep_dims) {} void Simplify(const Tensor *input) { std::vector bitmap(static_cast(input->dim_size()), false); @@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{ template struct ReduceMeanFunctor : ReduceFunctorBase { - ReduceMeanFunctor(const std::vector axis, + ReduceMeanFunctor(OpKernelContext *context, + const std::vector axis, const bool keep_dims) - : ReduceFunctorBase(axis, keep_dims) {} + : ReduceFunctorBase(context, axis, keep_dims) {} MaceStatus operator()(const Tensor *input, Tensor *output_tensor, diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h index cfa7bb2e94012d6cd6cbd78bc81fd31df6472555..f0ab1bf583b226950a9382e3d5b7a78dfa388c0b 100644 --- a/mace/kernels/reshape.h +++ b/mace/kernels/reshape.h @@ -19,17 +19,14 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { template -struct ReshapeFunctor { - ReshapeFunctor() {} +struct ReshapeFunctor : OpKernel { + explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const std::vector &out_shape, diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index b620b51d70822190d74e531d017a7be54c501d74..7245804154910b714e37a63315a7afc4ce40bf22 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/utils/logging.h" #ifdef MACE_ENABLE_OPENCL @@ -137,10 +138,11 @@ 
diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h
index b620b51d70822190d74e531d017a7be54c501d74..7245804154910b714e37a63315a7afc4ce40bf22 100644
--- a/mace/kernels/resize_bicubic.h
+++ b/mace/kernels/resize_bicubic.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"
 #include "mace/utils/logging.h"

 #ifdef MACE_ENABLE_OPENCL
@@ -137,10 +138,11 @@ inline void ResizeImage(const float *images,
   }
 }

-struct ResizeBicubicFunctorBase {
-  ResizeBicubicFunctorBase(const std::vector<index_t> &size,
+struct ResizeBicubicFunctorBase : OpKernel {
+  ResizeBicubicFunctorBase(OpKernelContext *context,
+                           const std::vector<index_t> &size,
                            bool align_corners)
-      : align_corners_(align_corners) {
+      : OpKernel(context), align_corners_(align_corners) {
     MACE_CHECK(size.size() == 2);
     out_height_ = size[0];
     out_width_ = size[1];
@@ -158,8 +160,10 @@ struct ResizeBicubicFunctor;

 template<>
 struct ResizeBicubicFunctor<DeviceType::CPU, float>
     : ResizeBicubicFunctorBase {
-  ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBicubicFunctorBase(size, align_corners) {}
+  ResizeBicubicFunctor(OpKernelContext *context,
+                       const std::vector<index_t> &size,
+                       bool align_corners)
+      : ResizeBicubicFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -204,8 +208,10 @@ struct ResizeBicubicFunctor<DeviceType::CPU, float>

 template <typename T>
 struct ResizeBicubicFunctor<DeviceType::GPU, T> : ResizeBicubicFunctorBase {
-  ResizeBicubicFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBicubicFunctorBase(size, align_corners) {}
+  ResizeBicubicFunctor(OpKernelContext *context,
+                       const std::vector<index_t> &size,
+                       bool align_corners)
+      : ResizeBicubicFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index cb41ef451dcbf25265227c3005b5532759afdced..92e57b4fde5fa39b0a5ae2801b4077633731eae7 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -21,6 +21,7 @@

 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/kernel.h"

 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -113,10 +114,12 @@ inline void ResizeImage(const float *images,
   }
 }

-struct ResizeBilinearFunctorBase {
-  ResizeBilinearFunctorBase(const std::vector<index_t> &size,
+struct ResizeBilinearFunctorBase : OpKernel {
+  ResizeBilinearFunctorBase(OpKernelContext *context,
+                            const std::vector<index_t> &size,
                             bool align_corners)
-      : align_corners_(align_corners) {
+      : OpKernel(context),
+        align_corners_(align_corners) {
     MACE_CHECK(size.size() == 2);
     out_height_ = size[0];
     out_width_ = size[1];
@@ -134,8 +137,10 @@ struct ResizeBilinearFunctor;

 template<>
 struct ResizeBilinearFunctor<DeviceType::CPU, float>
     : ResizeBilinearFunctorBase {
-  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBilinearFunctorBase(size, align_corners) {}
+  ResizeBilinearFunctor(OpKernelContext *context,
+                        const std::vector<index_t> &size,
+                        bool align_corners)
+      : ResizeBilinearFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
@@ -187,8 +192,10 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>

 template <typename T>
 struct ResizeBilinearFunctor<DeviceType::GPU, T> : ResizeBilinearFunctorBase {
-  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
-      : ResizeBilinearFunctorBase(size, align_corners) {}
+  ResizeBilinearFunctor(OpKernelContext *context,
+                        const std::vector<index_t> &size,
+                        bool align_corners)
+      : ResizeBilinearFunctorBase(context, size, align_corners) {}

   MaceStatus operator()(const Tensor *input,
                         Tensor *output,
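The align_corners flag threaded through both resize functor bases selects between the two standard source-coordinate mappings, and the CPU ResizeImage path reduces to the usual bilinear lerp. Shown here for orientation, not copied from the in-tree helper:

    #include <cassert>

    // One output pixel: lerp across the top and bottom edges, then between them.
    float Bilerp(float tl, float tr, float bl, float br, float dx, float dy) {
      float top = tl + (tr - tl) * dx;
      float bottom = bl + (br - bl) * dx;
      return top + (bottom - top) * dy;
    }

    // With align_corners, the source coordinate for output index i is
    //   i * (in_size - 1) / (out_size - 1);
    // otherwise it is i * in_size / out_size.

    int main() {
      assert(Bilerp(0.f, 1.f, 0.f, 1.f, 0.5f, 0.5f) == 0.5f);
      return 0;
    }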
EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index) - : type_(type), +struct ScalarMathFunctor : OpKernel { + ScalarMathFunctor(OpKernelContext *context, + const EltwiseType type, + const std::vector &coeff, + const float scalar_input, + const int32_t scalar_input_index) + : OpKernel(context), + type_(type), coeff_(coeff), scalar_input_(scalar_input), scalar_input_index_(scalar_input_index) {} diff --git a/mace/kernels/sgemm.h b/mace/kernels/sgemm.h index 15cec1dd9779166f3c220d3c4f589296ae48d706..3aaf5d478324ed8ec4d32452ceeb39422d89ac1f 100644 --- a/mace/kernels/sgemm.h +++ b/mace/kernels/sgemm.h @@ -89,7 +89,7 @@ typedef Major PackOrder; template class PackedBlock { public: - PackedBlock() : data_tensor_(GetDeviceAllocator(CPU), + PackedBlock() : data_tensor_(GetCPUAllocator(), DataTypeToEnum::v()) {} const T *data() { diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index 5de3ade1ec34b1a71e884bd02436a6cd11b0022a..0c2c91268f4d904daddfe401a166ae8b21a0e7eb 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -27,6 +27,7 @@ #include "mace/utils/utils.h" #include "mace/kernels/fixpoint.h" #include "mace/kernels/gemmlowp_util.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/quantize.h" #ifdef MACE_ENABLE_OPENCL @@ -40,7 +41,8 @@ template struct SoftmaxFunctor; template<> -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; template<> -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -354,7 +357,8 @@ struct SoftmaxFunctor { #ifdef MACE_ENABLE_OPENCL template -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *logits, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 786e270a41c30cfa7536725d379f2ed652b50ebc..7670632a2620c1d2552097127251ee5d850047d9 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -21,7 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -30,11 +30,13 @@ namespace mace { namespace kernels { -struct SpaceToBatchFunctorBase { - SpaceToBatchFunctorBase(const std::vector &paddings, +struct SpaceToBatchFunctorBase : OpKernel { + SpaceToBatchFunctorBase(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : paddings_(paddings.begin(), paddings.end()), + : OpKernel(context), + paddings_(paddings.begin(), paddings.end()), block_shape_(block_shape.begin(), block_shape.end()), b2s_(b2s) { MACE_CHECK( @@ -135,10 +137,11 @@ struct SpaceToBatchFunctor; template<> struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(const std::vector &paddings, + SpaceToBatchFunctor(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} + : 
SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, @@ -319,10 +322,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(const std::vector &paddings, + SpaceToBatchFunctor(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} + : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, diff --git a/mace/kernels/split.h b/mace/kernels/split.h index 95ff7861142e3f146f461328d04d1d21f2eb5a51..899e74dac04f9de7c11eb2c3e94f01706b464828 100644 --- a/mace/kernels/split.h +++ b/mace/kernels/split.h @@ -22,6 +22,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -31,15 +32,17 @@ namespace mace { namespace kernels { -struct SplitFunctorBase { - explicit SplitFunctorBase(const int32_t axis) : axis_(axis) {} +struct SplitFunctorBase : OpKernel { + SplitFunctorBase(OpKernelContext *context, const int32_t axis) + : OpKernel(context), axis_(axis) {} int32_t axis_; }; template struct SplitFunctor : SplitFunctorBase { - explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} + SplitFunctor(OpKernelContext *context, const int32_t axis) + : SplitFunctorBase(context, axis) {} MaceStatus operator()(const Tensor *input, const std::vector &output_list, @@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct SplitFunctor : SplitFunctorBase { - explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} + SplitFunctor(OpKernelContext *context, const int32_t axis) + : SplitFunctorBase(context, axis) {} MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future); + const std::vector &output_list, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/stack.h b/mace/kernels/stack.h index 9a84bed0a4d5fc41670aa4d7c5cdae4aafb9544b..4d465784ed18e73ccb1084c4666e89786002c6ce 100644 --- a/mace/kernels/stack.h +++ b/mace/kernels/stack.h @@ -22,14 +22,16 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct StackFunctor { - explicit StackFunctor(int axis) : axis_(axis) {} +struct StackFunctor : OpKernel { + StackFunctor(OpKernelContext *context, int axis) + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const std::vector &inputs, Tensor *output, diff --git a/mace/kernels/strided_slice.h b/mace/kernels/strided_slice.h index a6afb46c56cd2e500899197836b5803583dc6c06..a5d0eb3828d365f3e38ed4d5f4520e3092997eb8 100644 --- a/mace/kernels/strided_slice.h +++ b/mace/kernels/strided_slice.h @@ -21,26 +21,29 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct StridedSliceFunctor { - StridedSliceFunctor(int begin_mask, +struct StridedSliceFunctor : OpKernel { + StridedSliceFunctor(OpKernelContext *context, + int begin_mask, int end_mask, int ellipsis_mask, int new_axis_mask, int shrink_axis_mask, bool 
is_slice) - : begin_mask_(begin_mask), + : OpKernel(context), + begin_mask_(begin_mask), end_mask_(end_mask), ellipsis_mask_(ellipsis_mask), new_axis_mask_(new_axis_mask), shrink_axis_mask_(shrink_axis_mask), is_slice_(is_slice), - tmp_strides_tensor_(GetDeviceAllocator(D), + tmp_strides_tensor_(context->device()->allocator(), DataTypeToEnum::v()) {} MaceStatus operator()(const Tensor *input, diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h index 8de796aa9259474639c31c37b60a7d6f1439710d..87f9c0e2ab1e9115b520f68ff248b85cbede06e8 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.h @@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input, } template -struct TransposeFunctor { - explicit TransposeFunctor(const std::vector &dims) : dims_(dims) {} +struct TransposeFunctor : OpKernel { + TransposeFunctor(OpKernelContext *context, const std::vector &dims) + : OpKernel(context), dims_(dims) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/unstack.h b/mace/kernels/unstack.h index 82b5c467c69180366483672d13eb1e1c9c2a936f..b193c6b5a96455bf670983eb08e505790ad6afee 100644 --- a/mace/kernels/unstack.h +++ b/mace/kernels/unstack.h @@ -22,14 +22,16 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct UnstackFunctor { - explicit UnstackFunctor(int axis) : axis_(axis) {} +struct UnstackFunctor : OpKernel { + UnstackFunctor(OpKernelContext *context, int axis) + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const Tensor *input, const std::vector &outputs, diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index c7d6fc1aaf681d6a02e33dfc374da4dadcf6e6fb..c2e267c480f59118c33380aaf342d04ae37f3b3d 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -30,11 +30,13 @@ namespace mace { namespace kernels { -struct WinogradTransformFunctorBase { - WinogradTransformFunctorBase(const Padding &padding_type, +struct WinogradTransformFunctorBase : OpKernel { + WinogradTransformFunctorBase(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : strides_({1, 1}), + : OpKernel(context), + strides_({1, 1}), dilations_({1, 1}), padding_type_(padding_type), paddings_(paddings), @@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase { template struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(const Padding &padding_type, + WinogradTransformFunctor(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} + : WinogradTransformFunctorBase(context, + padding_type, + paddings, + block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { template struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(const Padding &padding_type, + WinogradTransformFunctor(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} + : WinogradTransformFunctorBase(context, + padding_type, + paddings, + block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -85,11 +95,13 @@ 
struct WinogradTransformFunctor<DeviceType::GPU, T>
 };
 #endif  // MACE_ENABLE_OPENCL

-struct WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctorBase(const ActivationType activation,
+struct WinogradInverseTransformFunctorBase : OpKernel {
+  WinogradInverseTransformFunctorBase(OpKernelContext *context,
+                                      const ActivationType activation,
                                       const float relux_max_limit,
                                       const int block_size)
-      : wino_blk_size_(block_size),
+      : OpKernel(context),
+        wino_blk_size_(block_size),
         activation_(activation),
         relux_max_limit_(relux_max_limit) {}
@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase {
 template<DeviceType D, typename T>
 struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctor(const ActivationType activation,
+  WinogradInverseTransformFunctor(OpKernelContext *context,
+                                  const ActivationType activation,
                                   const float relux_max_limit,
                                   const int block_size)
       : WinogradInverseTransformFunctorBase(
-            activation, relux_max_limit, block_size) {}
+            context, activation, relux_max_limit, block_size) {}

   MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                         Tensor *output,
@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
 template<typename T>
 struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
     : WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctor(const ActivationType activation,
+  WinogradInverseTransformFunctor(OpKernelContext *context,
+                                  const ActivationType activation,
                                   const float relux_max_limit,
                                   const int block_size)
       : WinogradInverseTransformFunctorBase(
-            activation, relux_max_limit, block_size) {}
+            context, activation, relux_max_limit, block_size) {}

   MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                         Tensor *output,
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 65d7d03c2debab0f78ff185bf3915a1e0f76039c..80a3594363842db02815f25a0b59d33003a9fc75 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -21,10 +21,12 @@
 #include
 #include "mace/core/net.h"
+#include "mace/core/device_context.h"
 #include "mace/ops/ops_register.h"
 #include "mace/public/mace.h"

 #ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif  // MACE_ENABLE_OPENCL
@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data,
 }

 #ifdef MACE_ENABLE_OPENCL
-MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
   // Check OpenCL avaliable
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = device->opencl_runtime();
   if (!runtime->is_opencl_avaliable()) {
     return MaceStatus::MACE_OUT_OF_RESOURCES;
   }
@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) {

 }  // namespace

+class GPUContextBuilder::Impl {
+ public:
+  void SetStoragePath(const std::string &path);
+
+  void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
+
+  void SetOpenCLParameterPath(const std::string &path);
+
+  std::shared_ptr<GPUContext> Finalize();
+
+ public:
+  std::string storage_path_;
+  std::vector<std::string> opencl_binary_paths_;
+  std::string opencl_parameter_path_;
+};
+
+void GPUContextBuilder::Impl::SetStoragePath(const std::string &path) {
+  storage_path_ = path;
+}
+
+void GPUContextBuilder::Impl::SetOpenCLBinaryPaths(
+    const std::vector<std::string> &paths) {
+  opencl_binary_paths_ = paths;
+}
+
+void GPUContextBuilder::Impl::SetOpenCLParameterPath(
+    const std::string &path) {
+  opencl_parameter_path_ = path;
+}
+
+std::shared_ptr<GPUContext> GPUContextBuilder::Impl::Finalize() {
+  return std::shared_ptr<GPUContext>(new GPUContext(storage_path_,
+                                                    opencl_binary_paths_,
+                                                    opencl_parameter_path_));
+}
+
+GPUContextBuilder::GPUContextBuilder() : impl_(new GPUContextBuilder::Impl) {}
+
+GPUContextBuilder::~GPUContextBuilder() = default;
+
+GPUContextBuilder &GPUContextBuilder::SetStoragePath(const std::string &path) {
+  impl_->SetStoragePath(path);
+  return *this;
+}
+
+GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths(
+    const std::vector<std::string> &paths) {
+  impl_->SetOpenCLBinaryPaths(paths);
+  return *this;
+}
+
+GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath(
+    const std::string &path) {
+  impl_->SetOpenCLParameterPath(path);
+  return *this;
+}
+
+std::shared_ptr<GPUContext> GPUContextBuilder::Finalize() {
+  return impl_->Finalize();
+}
+
+class MaceEngineConfig::Impl {
+ public:
+  explicit Impl(const DeviceType device_type);
+  ~Impl() = default;
+
+  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
+
+  MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
+
+  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
+                                CPUAffinityPolicy policy,
+                                bool use_gemmlowp);
+
+  MaceStatus SetOpenMPThreadAffinity(int num_threads,
+                                     const std::vector<int> &cpu_ids);
+
+  inline DeviceType device_type() const {
+    return device_type_;
+  }
+
+  inline int num_threads() const {
+    return num_threads_;
+  }
+
+  inline std::shared_ptr<GPUContext> gpu_context() const {
+    return gpu_context_;
+  }
+
+  inline GPUPriorityHint gpu_priority_hint() const {
+    return gpu_priority_hint_;
+  }
+
+  inline GPUPerfHint gpu_perf_hint() const {
+    return gpu_perf_hint_;
+  }
+
+ private:
+  DeviceType device_type_;
+  int num_threads_;
+  std::shared_ptr<GPUContext> gpu_context_;
+  GPUPriorityHint gpu_priority_hint_;
+  GPUPerfHint gpu_perf_hint_;
+};
+
+MaceEngineConfig::Impl::Impl(const DeviceType device_type)
+    : device_type_(device_type),
+      num_threads_(-1),
+      gpu_context_(new GPUContext),
+      gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
+      gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
+
+MaceStatus MaceEngineConfig::Impl::SetGPUContext(
+    std::shared_ptr<GPUContext> context) {
+  gpu_context_ = context;
+  return MACE_SUCCESS;
+}
+
+MaceStatus MaceEngineConfig::Impl::SetGPUHints(
+    GPUPerfHint perf_hint,
+    GPUPriorityHint priority_hint) {
+  gpu_perf_hint_ = perf_hint;
+  gpu_priority_hint_ = priority_hint;
+  return MACE_SUCCESS;
+}
+
+MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
+    int num_threads,
+    CPUAffinityPolicy policy,
+    bool use_gemmlowp) {
+  num_threads_ = num_threads;
+  return mace::SetOpenMPThreadsAndAffinityPolicy(
+      num_threads, policy, use_gemmlowp);
+}
+
+MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
+    int num_threads,
+    const std::vector<int> &cpu_ids) {
+  num_threads_ = num_threads;
+  return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
+}
+
+
+MaceEngineConfig::MaceEngineConfig(
+    const DeviceType device_type)
+    : impl_(new MaceEngineConfig::Impl(device_type)) {}
+
+MaceEngineConfig::~MaceEngineConfig() = default;
+
+MaceStatus MaceEngineConfig::SetGPUContext(
+    std::shared_ptr<GPUContext> context) {
+  return impl_->SetGPUContext(context);
+}
+
+MaceStatus MaceEngineConfig::SetGPUHints(
+    GPUPerfHint perf_hint,
+    GPUPriorityHint priority_hint) {
+  return impl_->SetGPUHints(perf_hint, priority_hint);
+}
+
+MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
+    int num_threads_hint,
+    CPUAffinityPolicy policy,
+    bool use_gemmlowp) {
+  return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
+}
+
+MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
+    int num_threads,
+    const std::vector<int> &cpu_ids) {
+  return
impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids); +} + +DeviceType MaceEngineConfig::device_type() const { + return impl_->device_type(); +} + +int MaceEngineConfig::num_threads() const { + return impl_->num_threads(); +} + +std::shared_ptr MaceEngineConfig::gpu_context() const { + return impl_->gpu_context(); +} + +GPUPerfHint MaceEngineConfig::gpu_perf_hint() const { + return impl_->gpu_perf_hint(); +} + +GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const { + return impl_->gpu_priority_hint(); +} + // Mace Tensor class MaceTensor::Impl { public: @@ -155,7 +350,7 @@ std::shared_ptr MaceTensor::data() { return impl_->data; } // Mace Engine class MaceEngine::Impl { public: - explicit Impl(DeviceType device_type); + explicit Impl(const MaceEngineConfig &config); ~Impl(); @@ -178,6 +373,7 @@ class MaceEngine::Impl { size_t model_data_size_; std::shared_ptr op_registry_; DeviceType device_type_; + std::unique_ptr device_; std::unique_ptr ws_; std::unique_ptr net_; std::map input_info_map_; @@ -189,11 +385,12 @@ class MaceEngine::Impl { MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; -MaceEngine::Impl::Impl(DeviceType device_type) +MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), model_data_size_(0), op_registry_(new OperatorRegistry()), - device_type_(device_type), + device_type_(config.device_type()), + device_(nullptr), ws_(new Workspace()), net_(nullptr) #ifdef MACE_ENABLE_HEXAGON @@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type) #endif { LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion(); + if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) { + device_.reset(new CPUDevice(config.num_threads())); + } +#ifdef MACE_ENABLE_OPENCL + if (device_type_ == DeviceType::GPU) { + device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(), + config.gpu_context()->opencl_cache_storage(), + config.gpu_priority_hint(), + config.gpu_perf_hint(), + config.gpu_context()->opencl_binary_storage(), + config.num_threads())); + } +#endif } MaceStatus MaceEngine::Impl::Init( @@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init( // Check avalibility #ifdef MACE_ENABLE_OPENCL if (device_type_ == DeviceType::GPU) { - MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def)); + MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get())); } #endif // Get input and output information. 
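
The hunks above establish the pattern this patch applies to every kernel: functors derive from OpKernel and receive an OpKernelContext, so device resources (allocator, OpenCL runtime) flow through the context rather than the removed global registries. A minimal sketch of that shape; ExampleFunctor and its argument are hypothetical, and it assumes OpKernel keeps the pointer as a context_ member, matching how strided_slice.h reaches context->device()->allocator():

// Hypothetical functor: only the OpKernel/OpKernelContext wiring
// mirrors this patch.
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"

namespace mace {
namespace kernels {

struct ExampleFunctor : OpKernel {
  ExampleFunctor(OpKernelContext *context, int example_arg)
      : OpKernel(context),
        example_arg_(example_arg),
        // Scratch tensors take the device allocator from the context,
        // as tmp_strides_tensor_ does in strided_slice.h above.
        scratch_tensor_(context->device()->allocator(),
                        DataTypeToEnum<float>::v()) {}

  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);
    return output->ResizeLike(input);  // placeholder body
  }

  int example_arg_;
  Tensor scratch_tensor_;
};

}  // namespace kernels
}  // namespace mace

On the public-API side, the per-process setters give way to a MaceEngineConfig that travels into the engine, as the mace.cc hunks here show. A sketch of the new calling sequence, assuming model_pb holds the serialized model; the storage path, data file, and node names are placeholders, not values from this patch:

#include <memory>
#include <string>
#include <vector>
#include "mace/public/mace.h"

mace::MaceStatus BuildEngine(const std::vector<unsigned char> &model_pb,
                             std::shared_ptr<mace::MaceEngine> *engine) {
  mace::MaceEngineConfig config(mace::DeviceType::GPU);
  config.SetCPUThreadPolicy(4, mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY,
                            true /* use_gemmlowp */);
  // The storage path must be a directory the app owns, per the
  // benchmark_model.cc comment.
  std::shared_ptr<mace::GPUContext> gpu_context =
      mace::GPUContextBuilder()
          .SetStoragePath("/data/data/com.example/mace")
          .Finalize();
  config.SetGPUContext(gpu_context);
  config.SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
                     mace::GPUPriorityHint::PRIORITY_NORMAL);
  return mace::CreateMaceEngineFromProto(model_pb, "model.data",
                                         {"input"}, {"output"},
                                         config, engine);
}
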
@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init( << MakeString(MapKeys(input_info_map_)); } ws_->CreateTensor(MakeString("mace_input_node_", input_name), - GetDeviceAllocator(device_type_), DT_FLOAT); + device_->allocator(), DT_FLOAT); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init( << MakeString(MapKeys(output_info_map_)); } ws_->CreateTensor(MakeString("mace_output_node_", output_name), - GetDeviceAllocator(device_type_), DT_FLOAT); + device_->allocator(), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON if (device_type_ == HEXAGON) { @@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init( } } else { #endif - MACE_RETURN_IF_ERROR(ws_->LoadModelTensor( - *net_def, device_type_, model_data)); + MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, + device_.get(), + model_data)); // Init model - auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_, + auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(), NetMode::INIT); MACE_RETURN_IF_ERROR(net->Run()); - net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_); + net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get()); #ifdef MACE_ENABLE_HEXAGON } #endif if (device_type_ == DeviceType::GPU) { - ws_->RemoveAndReloadBuffer(*net_def, model_data); + ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); } return MaceStatus::MACE_SUCCESS; } @@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run( #ifdef MACE_ENABLE_OPENCL if (device_type_ == GPU) { - OpenCLRuntime::Global()->SaveBuiltCLProgram(); + device_->opencl_runtime()->SaveBuiltCLProgram(); } #endif for (auto &output : *outputs) { @@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run( return MACE_SUCCESS; } -MaceEngine::MaceEngine(DeviceType device_type): - impl_(new MaceEngine::Impl(device_type)) {} +MaceEngine::MaceEngine(const MaceEngineConfig &config): + impl_(new MaceEngine::Impl(config)) {} MaceEngine::~MaceEngine() = default; @@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto( const std::string &model_data_file, const std::vector &input_nodes, const std::vector &output_nodes, - const DeviceType device_type, + const MaceEngineConfig &config, std::shared_ptr *engine) { LOG(INFO) << "Create MaceEngine from model pb"; // load model @@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto( std::shared_ptr net_def(new NetDef()); net_def->ParseFromArray(&model_pb[0], model_pb.size()); - engine->reset(new mace::MaceEngine(device_type)); + engine->reset(new mace::MaceEngine(config)); MaceStatus status = (*engine)->Init( net_def.get(), input_nodes, output_nodes, model_data_file); diff --git a/mace/libmace/mace_runtime.cc b/mace/libmace/mace_runtime.cc deleted file mode 100644 index 24b2cd8f32b04dc14a425cbd73945f6d7851a4a8..0000000000000000000000000000000000000000 --- a/mace/libmace/mace_runtime.cc +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/macros.h" -#include "mace/core/file_storage.h" -#include "mace/core/runtime/cpu/cpu_runtime.h" -#include "mace/public/mace_runtime.h" -#include "mace/utils/logging.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_runtime.h" -#endif // MACE_ENABLE_OPENCL - -namespace mace { - -class FileStorageFactory::Impl { - public: - explicit Impl(const std::string &path); - - std::unique_ptr CreateStorage(const std::string &name); - - private: - std::string path_; -}; - -FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {} - -std::unique_ptr FileStorageFactory::Impl::CreateStorage( - const std::string &name) { - return std::move(std::unique_ptr( - new FileStorage(path_ + "/" + name))); -} - -FileStorageFactory::FileStorageFactory(const std::string &path): - impl_(new FileStorageFactory::Impl(path)) {} - -FileStorageFactory::~FileStorageFactory() = default; - -std::unique_ptr FileStorageFactory::CreateStorage( - const std::string &name) { - return impl_->CreateStorage(name); -} - -extern std::shared_ptr kStorageFactory; - -void SetKVStorageFactory(std::shared_ptr storage_factory) { - VLOG(1) << "Set internal KV Storage Engine"; - kStorageFactory = storage_factory; -} - -// Set OpenCL Compiled Binary paths, just call once. (Not thread-safe) -void SetOpenCLBinaryPaths(const std::vector &paths) { -#ifdef MACE_ENABLE_OPENCL - OpenCLRuntime::ConfigureOpenCLBinaryPath(paths); -#else - MACE_UNUSED(paths); -#endif // MACE_ENABLE_OPENCL -} - -extern std::string kOpenCLParameterPath; - -void SetOpenCLParameterPath(const std::string &path) { -#ifdef MACE_ENABLE_OPENCL - kOpenCLParameterPath = path; -#else - MACE_UNUSED(path); -#endif // MACE_ENABLE_OPENCL -} - -void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { -#ifdef MACE_ENABLE_OPENCL - VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint - << ", gpu_priority_hint: " << gpu_priority_hint; - OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint); -#else - MACE_UNUSED(gpu_perf_hint); - MACE_UNUSED(gpu_priority_hint); -#endif // MACE_ENABLE_OPENCL -} - -MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp) { - VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint - << ", affinity policy: " << policy; - return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, - policy, - use_gemmlowp); -} - -MaceStatus SetOpenMPThreadAffinity(int num_threads, - const std::vector &cpu_ids) { - return SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids); -} - -MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, - std::vector *little_core_ids) { - return GetCPUBigLittleCoreIDs(big_core_ids, little_core_ids); -} - - -}; // namespace mace diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds index 76d8f1c2c553e09ac7aaa054999f4af543baa7b1..4bdc33db9f6162924285c52ecc9bbe76435d487d 100644 --- a/mace/libmace/mace_version_script.lds +++ b/mace/libmace/mace_version_script.lds @@ -1,15 +1,10 @@ mace { global: + *GPUContextBuilder*; + *MaceEngineConfig*; *MaceTensor*; *MaceEngine*; *CreateMaceEngineFromProto*; - *FileStorageFactory*; - *SetKVStorageFactory*; - *SetOpenCLBinaryPaths*; - *SetOpenCLParameterPath*; - *SetGPUHints*; - *SetOpenMPThreadPolicy*; - *SetOpenMPThreadAffinity*; *GetBigLittleCoreIDs*; *MaceVersion*; diff --git a/mace/ops/BUILD b/mace/ops/BUILD 
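
For reference, the mace/libmace/mace_runtime.cc deleted above was the last home of the process-global setters; each maps onto the config/builder API exported in its place (a rough correspondence for migrating callers, not text from the patch):

  SetOpenMPThreadPolicy(n, policy, gemm)  ->  MaceEngineConfig::SetCPUThreadPolicy(n, policy, gemm)
  SetOpenMPThreadAffinity(n, cpu_ids)     ->  MaceEngineConfig::SetOpenMPThreadAffinity(n, cpu_ids)
  SetGPUHints(perf, priority)             ->  MaceEngineConfig::SetGPUHints(perf, priority)
  SetOpenCLBinaryPaths(paths)             ->  GPUContextBuilder::SetOpenCLBinaryPaths(paths)
  SetOpenCLParameterPath(path)            ->  GPUContextBuilder::SetOpenCLParameterPath(path)
  SetKVStorageFactory(factory)            ->  GPUContextBuilder::SetStoragePath(path)

GetBigLittleCoreIDs remains exported by the version script and is unaffected. The ops/BUILD changes below then compile the test utilities with the same flags as the ops themselves:
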
index 07aad1d24b549c4be26a8521549e68aba50662c8..312bdc90babe7d04574a6823455893085771212e 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -23,8 +23,25 @@ cc_library( hdrs = [ "ops_test_util.h", ], + srcs = [ + "ops_test_util.cc", + ], + copts = [ + "-Werror", + "-Wextra", + ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", + ]) + if_opencl_enabled([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), deps = [ - "//mace/core", + "//mace/ops", "@gtest", ], ) @@ -36,6 +53,7 @@ cc_library( exclude = [ "*_test.cc", "*_benchmark.cc", + "ops_test_util.cc", "buffer_to_image.cc", "image_to_buffer.cc", "lstmcell.cc", diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 8938ea74b5dab1fb112a0843922aacac8c8b67cd..3b48891e769b7133f9b780a66d6ada4760b4ee7e 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -26,9 +26,10 @@ namespace ops { template class ActivationOp : public Operator { public: - ActivationOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(kernels::StringToActivationType( + ActivationOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), static_cast( diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index cc40ac9d7e2167c178ee10644b57d886fbc58289..49422f3a11969c89e1c55453bbfc40e44f797bdc 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -58,7 +58,7 @@ void TestSimpleRelu() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() { net.RunOp(D); } - auto expected = CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); + auto expected = net.CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -159,7 +159,7 @@ void TestSimpleRelux() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -209,7 +209,7 @@ void TestSimpleReluRelux() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -267,7 +267,7 @@ void TestSimplePrelu() { } if (D == DeviceType::CPU) { - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -318,7 +318,7 @@ void TestSimpleTanh() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092, -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758, @@ -371,7 +371,7 @@ void TestSimpleSigmoid() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01, 6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 
9.82013790e-01, diff --git a/mace/ops/addn.h b/mace/ops/addn.h index 64373343363ff620d34ea735078c1291b7450616..4238a013e455723f9ad88cbdec8dee79be862885 100644 --- a/mace/ops/addn.h +++ b/mace/ops/addn.h @@ -26,8 +26,8 @@ namespace ops { template class AddNOp : public Operator { public: - AddNOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + AddNOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { Tensor *output_tensor = this->Output(0); diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 2f5aa28a78c575733337e48c721ae486ec8a9ce7..7154ad52d097a9c09144f0b1d1630ca8be538e20 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -39,7 +39,7 @@ void SimpleAdd2() { // Run net.RunOp(D); - auto expected = CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); + auto expected = net.CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -98,7 +98,7 @@ void SimpleAdd3() { } auto expected = - CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); + net.CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); } @@ -136,8 +136,8 @@ void RandomTest() { // run on cpu net.RunOp(); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu for (int i = 0; i < input_num; ++i) { @@ -160,7 +160,7 @@ void RandomTest() { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); } } diff --git a/mace/ops/argmax.h b/mace/ops/argmax.h index ce493059387fc0d6aff802b7db053b9e47c8cfcb..b1d7ec4efc4d7d448eb6676d838730bfd5450386 100644 --- a/mace/ops/argmax.h +++ b/mace/ops/argmax.h @@ -26,8 +26,8 @@ namespace ops { template class ArgMaxOp : public Operator { public: - ArgMaxOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/argmax_test.cc b/mace/ops/argmax_test.cc index bf00b57933969f394c61593baa10518085b3c92a..ca7ece351801ef781edbd04826a2fb285ee1f77c 100644 --- a/mace/ops/argmax_test.cc +++ b/mace/ops/argmax_test.cc @@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector &input_shape, } // Check - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 9d983f10a6105d144f0fd1366814072327765a64..7221c3ca1f10b535d1e570f4356e3720ac298a7d 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -25,9 +25,9 @@ namespace ops { template class BatchNormOp : public Operator { public: - BatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(false, kernels::ActivationType::NOOP, 0.0f) { + BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, false, 
kernels::ActivationType::NOOP, 0.0f) { epsilon_ = OperatorBase::GetOptionalArg("epsilon", static_cast(1e-4)); } @@ -52,7 +52,8 @@ class BatchNormOp : public Operator { Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - return functor_(input, scale, offset, mean, var, epsilon_, output, future); + return functor_(input, scale, offset, + mean, var, epsilon_, output, future); } private: diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index b72ec73ad0e5a39da814ce29bc407e82b7e6e41c..7d5b77daf1eb3cdb7a4402f83657421618ff2f44 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -79,7 +79,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); @@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { @@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-1, 1e-2); } TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { @@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { @@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-1, 1e-2); } } // namespace test diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index 
91c4a9ba8a929765d187c28777f02db06adc4e1b..fa1ed2c6a2534b62795cd3d2c541f722795ff9de 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -27,9 +27,10 @@ namespace ops { template class BatchToSpaceNDOp : public Operator { public: - BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), + BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), true) {} diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h index 901c1e74235b2b080730955082d68d0861c0b201..ee3de99116fea2a49153c2d1f79a73b570f8b02d 100644 --- a/mace/ops/bias_add.h +++ b/mace/ops/bias_add.h @@ -24,10 +24,11 @@ namespace ops { template class BiasAddOp : public Operator { public: - BiasAddOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(static_cast(OperatorBase::GetOptionalArg( - "data_format", NHWC))) {} + BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + static_cast(OperatorBase::GetOptionalArg( + "data_format", NHWC))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index c41584542c01ab2ea5594cca1242a0bf2242c596..51c8cc8871f370f878025b919bde91d92d39fba1 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -66,7 +66,7 @@ void BiasAddSimple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5}); @@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { @@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } // namespace test diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index 7c59c822d2e19f129248ddd76f3fd9bc69a5fe74..0fa34c30f52a339de00e5f1d5efd28fe844a433b 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -24,9 +24,10 @@ namespace ops { template class BufferToImageOp : public Operator { public: - BufferToImageOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - 
functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} + BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input_tensor = this->Input(INPUT); diff --git a/mace/ops/cast.h b/mace/ops/cast.h index cee022ec4aedb9b848e9dc46b3e564e561c08b36..56d20d52cb97952476b46c993bba6024f59109c2 100644 --- a/mace/ops/cast.h +++ b/mace/ops/cast.h @@ -25,8 +25,8 @@ namespace ops { template class CastOp : public Operator { public: - CastOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + CastOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { MACE_UNUSED(future); diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h index bd9234c1abab8c3c6391f781a4b7177c1a82d5b1..a459a0b38e115ace4e4333ce5ca3dc5539f61afe 100644 --- a/mace/ops/channel_shuffle.h +++ b/mace/ops/channel_shuffle.h @@ -26,10 +26,10 @@ namespace ops { template class ChannelShuffleOp : public Operator { public: - ChannelShuffleOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), group_(OperatorBase::GetOptionalArg("group", 1)), - functor_(this->group_) {} + functor_(context, this->group_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index 0b674dab57b6a24feb81eed7bba64415a969ecb3..2102fe7652b2b552d8f9c8caeb09abfa786c1a57 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { NHWC); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { kernels::BufferType::IN_OUT_CHANNEL); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 16}, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31}); diff --git a/mace/ops/concat.h b/mace/ops/concat.h index be76371494a2116f180420ddadf75090bb103b54..94dee3d33dd8876183bb9934874b6f1cd4d2766f 100644 --- a/mace/ops/concat.h +++ b/mace/ops/concat.h @@ -26,9 +26,9 @@ namespace ops { template class ConcatOp : public Operator { public: - ConcatOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 3)) {} + ConcatOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} MaceStatus Run(StatsFuture *future) override { MACE_CHECK(this->InputSize() >= 2) diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index b15045cd18884e7112c19da7a6c5bdeab53560f0..5864e1edb0ad1bc4eed4c9db9d1411ea1a6499c2 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -28,9 +28,10 @@ namespace ops { template class Conv2dOp : public ConvPool2dOpBase { public: - Conv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), + Conv2dOp(const OperatorDef &op_def, 
OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), + functor_(context, + this->strides_.data(), this->padding_type_, this->paddings_, this->dilations_.data(), @@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase { OperatorBase::GetOptionalArg("max_limit", 0.0f), static_cast(OperatorBase::GetOptionalArg( "is_filter_transformed", false)), - ws->GetScratchBuffer(D)) {} + context->workspace()->GetScratchBuffer(D)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index ecfdafa2da6d257a0762dd7f06c71968c4348834..dd338e275356fd1bfab3cc21b50fade2b77da46e 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {18.1f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 1}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); @@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() { } // Check - auto expected = CreateTensor({1, 1, 1, 1}, {18.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {18.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -298,7 +298,7 @@ void TestNHWCCombined3x3() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } template @@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() { } // Check - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } @@ -515,7 +515,7 @@ void TestConv1x1() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 10, 2}, {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, @@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, 
ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() { // Run net.Run(); // Check - auto expected = CreateTensor({1, 1, 1, 1}, {230}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {230}); ExpectTensorNear(*expected, *output); } diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h index 9c4860df735b1d59cc744ce18abb434f9a166c3b..0a8a8c174617dd0474ec4bdc8e82375c291f5f2a 100644 --- a/mace/ops/conv_pool_2d_base.h +++ b/mace/ops/conv_pool_2d_base.h @@ -26,8 +26,8 @@ namespace ops { template class ConvPool2dOpBase : public Operator { public: - ConvPool2dOpBase(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), strides_(OperatorBase::GetRepeatedArgs("strides")), padding_type_(static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(SAME)))), diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index ac184c80ba3295ccc4ab3f553940c0f485972030..8eecd77dca5d2ae149b3a3711b5c1d3cd631d6ad 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -21,6 +21,8 @@ namespace test { TEST(CoreTest, INIT_MODE) { std::vector op_defs; + Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); + std::unique_ptr> tuner; Workspace ws; op_defs.emplace_back(OperatorDef()); @@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { .AddIntArg("mode", static_cast(NetMode::INIT)) 
.Finalize(&op_defs[op_defs.size() - 1]); - Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), + Tensor *input = ws.CreateTensor("Input", device->allocator(), DataTypeToEnum::v()); input->Resize({1, 3, 3, 3}); { @@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) { } std::shared_ptr op_registry(new OperatorRegistry()); auto net = - CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT); + CreateNet(op_registry, net_def, &ws, device, NetMode::INIT); net->Run(); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU); + net = CreateNet(op_registry, net_def, &ws, device); net->Run(); EXPECT_TRUE(ws.GetTensor("Output") != nullptr); diff --git a/mace/ops/crop.h b/mace/ops/crop.h index f1f179b9457786e083ac9e04a7ee5231b5cfba40..f50450693580a0d193cac1975e5903e1e624cfd5 100644 --- a/mace/ops/crop.h +++ b/mace/ops/crop.h @@ -26,9 +26,10 @@ namespace ops { template class CropOp : public Operator { public: - CropOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 2), + CropOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("axis", 2), OperatorBase::GetRepeatedArgs("offset")) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index a28205b9301cf5e8f543fee457bcedec45d515e6..b4bb7fddf1e1c630e5ad3897fcd7558fe5c5662c 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -75,7 +75,7 @@ void RunCrop(const std::vector &input_shape, "Output", NHWC); } // Check - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output")); } } // namespace diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 188b8ba04438532d14acda575899ba29e2d16353..ec5b348e201f4048398ea2f3b8f69fca63c5337a 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -26,9 +26,10 @@ namespace ops { template class Deconv2dOp : public Operator { public: - Deconv2dOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("strides"), + Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("strides"), static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(SAME))), OperatorBase::GetRepeatedArgs("padding_values"), diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 954d6bf41a69851ca2f872c684ee3e9a96b96610..67d0ac141e77218d2a047c4b472f1e13e661c8b8 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -79,7 +79,7 @@ void RunTestSimple(const std::vector &input_shape, "Output", NHWC); } - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.0001); } @@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch, ImageToBuffer(&net, "OutputImage", 
"OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index 4be3f2a0dc08128eec9ca7141df414ab73c9bf81..49183873733cd4d878ad1113f64c76aa918744cd 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -27,10 +27,10 @@ namespace ops { template class DepthToSpaceOp : public Operator { public: - DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), block_size_(OperatorBase::GetOptionalArg("block_size", 1)), - functor_(this->block_size_, true) {} + functor_(context, this->block_size_, true) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index e61590ff9b28ab73d1cb7559cb9dc3b3622fe842..99c4fb0b6e4bf05a4e2c502731136966cabdd07e 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s, ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index 2762aea5f48114413b90cc0250ab010de4486244..549af07a3977b65464288a96096b42cb22c2ac6d 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -29,9 +29,10 @@ namespace ops { template class DepthwiseConv2dOp : public ConvPool2dOpBase { public: - DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), + DepthwiseConv2dOp(const OperatorDef &op_def, OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), + functor_(context, + this->strides_.data(), this->padding_type_, this->paddings_, this->dilations_.data(), diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index a2d57911db1b7c136c87ec5c7b5ac6616f6ce289..6d6b84f1f79356d9f1eb6411fc564c912fca3e1d 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -80,7 +80,7 @@ void SimpleValidTest() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f}); @@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch, } auto expected = - CreateTensor({1, out_height, out_width, out_channels}, expect); + net.CreateTensor({1, out_height, out_width, out_channels}, expect); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) { "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) { // Check if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, + 
ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-4); } else { - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-2, 1e-2); } }; @@ -387,7 +387,7 @@ void QuantSimpleValidTest() { net.Run(); // Check - auto expected = CreateTensor({1, 1, 1, 2}, {255, 21}); + auto expected = net.CreateTensor({1, 1, 1, 2}, {255, 21}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index 161d0e4fd9b5dba81d5d9d504ecd4a608edbedd4..f795256218eed2087d372e1acdbe5ba1db2fce96 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -24,9 +24,10 @@ namespace ops { template class EltwiseOp : public Operator { public: - EltwiseOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + EltwiseOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), functor_( + context, static_cast(OperatorBase::GetOptionalArg( "type", static_cast(kernels::EltwiseType::NONE))), OperatorBase::GetRepeatedArgs("coeff"), diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 55a0ce977563c16e8e3914a5c435fed7cedbbabc..76b04f3423a31fd344edb4cadee02857dcc4a71a 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type, MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({}, {output}); + auto expected = net.CreateTensor({}, {output}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(shape, output); + auto expected = net.CreateTensor(shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, if (input0.size() < input1.size()) { output_shape = shape1; } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type, MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type, net.RunOp(DeviceType::CPU); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImg", kernels::BufferType::IN_OUT_CHANNEL); @@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } @@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, net.RunOp(DeviceType::CPU); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = 
net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input0", "InputImg0", kernels::BufferType::IN_OUT_CHANNEL); @@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } } // namespace diff --git a/mace/ops/fill.h b/mace/ops/fill.h index a8b55dbe8984f2d6f87e39e1d39373e9ad909b58..b6836d11978d7263439b03eda7b072feacf06c19 100644 --- a/mace/ops/fill.h +++ b/mace/ops/fill.h @@ -26,9 +26,9 @@ namespace ops { template class FillOp : public Operator { public: - FillOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_() {} + FillOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *shape = this->Input(SHAPE); diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h index 9cd76c738b12282ec7cff8974cd48923405c4910..345d87b476ded184fa7b02ba8c47072589e41bc6 100644 --- a/mace/ops/folded_batch_norm.h +++ b/mace/ops/folded_batch_norm.h @@ -26,9 +26,10 @@ namespace ops { template class FoldedBatchNormOp : public Operator { public: - FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(true, + FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + true, kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 3979583a1384dd962d850f912ee3546984c7cb76..16a6ad684809436832569a285158105e3b9137f2 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -83,7 +83,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); @@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { @@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, 
*net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-2); } TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { @@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { @@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-2); } } // namespace test diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h index 8ec0039185366a5419cf2b56dfd9317b3a5342a3..313780cb3b9b39d568005ee84fa154390b13e827 100644 --- a/mace/ops/fully_connected.h +++ b/mace/ops/fully_connected.h @@ -26,9 +26,9 @@ namespace ops { template class FullyConnectedOp : public Operator { public: - FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(kernels::StringToActivationType( + FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} @@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator { " don't match."); } - return functor_(input, weight, bias, output, future); + return functor_(input, weight, + bias, output, future); } private: diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 8b30096da8475217c48f05b85dde702c3e754edc..cdeba2439a94e5987c9844c4482f57af78dbb14c 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -76,7 +76,7 @@ void Simple(const std::vector &input_shape, } // Check - auto expected = CreateTensor(output_shape, output_value); + auto expected = net.CreateTensor(output_shape, output_value); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -156,8 +156,8 @@ void Random(const index_t batch, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -181,10 +181,10 @@ void Random(const index_t batch, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, + 
ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-1); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-3); } } diff --git a/mace/ops/gather.h b/mace/ops/gather.h index 37689b30a2765ee199db58f094a10f5513da8de6..fe4026d969835cc1dc456258194d40d7fb120584 100644 --- a/mace/ops/gather.h +++ b/mace/ops/gather.h @@ -24,9 +24,10 @@ namespace ops { template class GatherOp : public Operator { public: - GatherOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0), + GatherOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("axis", 0), OperatorBase::GetOptionalArg("y", 1.0)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 3a35b3380ff8280f99b705fdf72b59d7a89ca77d..07a8438c515c88a9ae2631f79e52f27d45bfe237 100644 --- a/mace/ops/gather_test.cc +++ b/mace/ops/gather_test.cc @@ -47,7 +47,7 @@ void TestGather(const std::vector &weight_shape, // Run net.RunOp(CPU); - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } diff --git a/mace/ops/identity.h b/mace/ops/identity.h index 7140314cc25fe5ce809f577de9a4e4ed9bd8ec1c..be4d75bf48d2c92281fe70d4014fc5b0f5b063fa 100644 --- a/mace/ops/identity.h +++ b/mace/ops/identity.h @@ -25,8 +25,8 @@ namespace ops { template class IdentityOp : public Operator { public: - IdentityOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + IdentityOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index c1b9b0b8a9d5af2b4ad79b7d6b2206db5b3677d8..fc259a01b9c2d7c5ac01cc05762bbe1d12abe2b5 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -24,9 +24,10 @@ namespace ops { template class ImageToBufferOp : public Operator { public: - ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} + ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/infer_conv2d_shape.h b/mace/ops/infer_conv2d_shape.h index bc6163c170524800a5e0bbe5d83b7c419aeb123b..a39f66b6dec6109909384592e9db9bb4cab601c8 100644 --- a/mace/ops/infer_conv2d_shape.h +++ b/mace/ops/infer_conv2d_shape.h @@ -26,8 +26,8 @@ namespace ops { template class InferConv2dShapeOp : public Operator { public: - InferConv2dShapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h index d8ad1d3eac999e315b1d84899643952cbd9997a1..66265f19e0fcef441e7374072c17cdd525e47f71 100644 --- a/mace/ops/local_response_norm.h +++ 
b/mace/ops/local_response_norm.h @@ -24,8 +24,8 @@ namespace ops { template class LocalResponseNormOp : public Operator { public: - LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), functor_() { + LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) { depth_radius_ = OperatorBase::GetOptionalArg("depth_radius", 5); bias_ = OperatorBase::GetOptionalArg("bias", 1.0f); alpha_ = OperatorBase::GetOptionalArg("alpha", 1.0f); diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index dc12f28a32b157a89d45a4c91c22480664478917..6bb726ead5bf3f8fbe6173013d99557cbed03209 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -46,7 +46,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 6}, {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); diff --git a/mace/ops/lstmcell.h b/mace/ops/lstmcell.h index 300794f2341261a0ea13d1be0dffc48a3a6e1a78..3037c891ff5a9b7d9fb25096632556cce4193296 100644 --- a/mace/ops/lstmcell.h +++ b/mace/ops/lstmcell.h @@ -26,10 +26,12 @@ namespace ops { template class LSTMCellOp : public Operator { public: - LSTMCellOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast( - OperatorBase::GetOptionalArg("scalar_input", 0.0))) {} + LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + static_cast( + OperatorBase::GetOptionalArg("scalar_input", + 0.0))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h index e5e0dafaafdf547817727dad8079373858406dc6..ceccb9398aaa7d5b730951672c0370e5509e1f7f 100644 --- a/mace/ops/matmul.h +++ b/mace/ops/matmul.h @@ -24,8 +24,9 @@ namespace ops { template class MatMulOp : public Operator { public: - MatMulOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + MatMulOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context), transpose_a_(OperatorBase::GetOptionalArg("transpose_a", false)), transpose_b_(OperatorBase::GetOptionalArg("transpose_b", false)) { } @@ -46,7 +47,8 @@ class MatMulOp : public Operator { MACE_CHECK(ak == bk, "the number of A's column ", ak, " must be equal to B's row ", bk); - return functor_(A, B, C, transpose_a_, transpose_b_, future); + return functor_(A, B, C, + transpose_a_, transpose_b_, future); } private: diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 18a9ddc88e7b439b68404696f9082ca788eb68c6..9225b2269d5f37f36c412b26695ab07f36788b69 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -65,7 +65,7 @@ void Simple(const std::vector &A_shape, } // Check - auto expected = CreateTensor(C_shape, C_value); + auto expected = net.CreateTensor(C_shape, C_value); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -171,15 +171,15 @@ void Complex(const std::vector &batch, // Check EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); - expected.Reshape({batch_count, height, out_width}); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); + expected->Reshape({batch_count, height, out_width}); if (DataTypeToEnum::value == 
DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-5); } } diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..5be4cb9696978de91def270ae880df203849fcd4 --- /dev/null +++ b/mace/ops/ops_test_util.cc @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +OpTestContext *OpTestContext::Get() { + static OpTestContext instance; + return &instance; +} + +std::shared_ptr OpTestContext::gpu_context() const { + return gpu_context_; +} + +Device *OpTestContext::GetDevice(DeviceType device_type) { + return device_map_[device_type].get(); +} + +OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) { + device_map_[DeviceType::CPU] = std::unique_ptr(new CPUDevice(-1)); + device_map_[DeviceType::GPU] = std::unique_ptr( + new GPUDevice(gpu_context_->opencl_tuner(), + gpu_context_->opencl_cache_storage(), + GPUPriorityHint::PRIORITY_NORMAL)); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 2dc29241a73bfc69ee681d01bf15a6b8d928f0f3..278c3515f575c2c72eaa9f9a9908db491fc0c3cd 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -26,7 +27,8 @@ #include "gtest/gtest.h" #include "mace/core/net.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/device_context.h" +#include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/kernels/opencl/common.h" @@ -110,9 +112,28 @@ class OpDefBuilder { OperatorDef op_def_; }; +class OpTestContext { + public: + static OpTestContext *Get(); + std::shared_ptr gpu_context() const; + Device *GetDevice(DeviceType device_type); + private: + OpTestContext(); + MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); + + std::shared_ptr gpu_context_; + std::map> device_map_; +}; + class OpsTestNet { public: - OpsTestNet() : op_registry_(new OperatorRegistry()) {} + OpsTestNet() : + op_registry_(new OperatorRegistry()) { + } + + ~OpsTestNet() { + Sync(); + } template void AddInputFromArray(const std::string &name, @@ -121,7 +142,8 @@ class OpsTestNet { const float scale = 0.0, const int32_t zero_point = 0) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -136,7 +158,8 @@ class 
OpsTestNet { const std::vector &shape, const T data) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -149,7 +172,8 @@ class OpsTestNet { bool positive = true, bool truncate = false) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -184,8 +208,10 @@ class OpsTestNet { template void Transpose2D(const std::string &src_name, const std::string &dst_name) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 2, "input shape != 2"); output->Resize({input_shape[1], input_shape[0]}); @@ -205,8 +231,10 @@ class OpsTestNet { void CopyData(const std::string &src_name, const std::string &dst_name) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); output->Resize(input_shape); @@ -222,8 +250,10 @@ class OpsTestNet { const std::string &dst_name, const DataFormat dst_format) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); @@ -352,8 +382,10 @@ class OpsTestNet { void FillNHWCInputToNCHWInput(const std::string &name_nchw, const std::string &name_nhwc) { Tensor *input = ws_.GetTensor(name_nhwc); - Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + name_nchw, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); index_t batch = input_shape[0]; index_t height = input_shape[1]; @@ -374,6 +406,22 @@ class OpsTestNet { } } + // Create standalone tensor on device D with T type. 
+ template + std::unique_ptr CreateTensor( + const std::vector &shape = {}, + const std::vector &data = {}) { + std::unique_ptr res( + new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v())); + if (!data.empty()) { + res->Resize(shape); + T *input_data = res->mutable_data(); + memcpy(input_data, data.data(), data.size() * sizeof(T)); + } + return res; + } + OperatorDef *NewOperatorDef() { op_defs_.clear(); op_defs_.emplace_back(OperatorDef()); @@ -392,8 +440,9 @@ class OpsTestNet { for (auto &op_def_ : op_defs_) { net_def.add_op()->CopyFrom(op_def_); } - net_ = CreateNet(op_registry_, net_def, &ws_, device); - device_ = device; + net_ = CreateNet(op_registry_, net_def, &ws_, + OpTestContext::Get()->GetDevice(device)); + device_type_ = device; return net_ != nullptr; } @@ -416,10 +465,15 @@ class OpsTestNet { MaceStatus RunOp() { return RunOp(DeviceType::CPU); } MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { - device_ = device; - net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT); + device_type_ = device; + net_ = CreateNet(op_registry_, + net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + NetMode::INIT); MACE_RETURN_IF_ERROR(net_->Run()); - net_ = CreateNet(op_registry_, net_def, &ws_, device); + net_ = CreateNet(op_registry_, net_def, &ws_, + OpTestContext::Get()->GetDevice(device)); return net_->Run(); } @@ -432,9 +486,12 @@ class OpsTestNet { } void Sync() { - if (net_ && device_ == DeviceType::GPU) { - OpenCLRuntime::Global()->command_queue().finish(); +#ifdef MACE_ENABLE_OPENCL + if (net_ && device_type_ == DeviceType::GPU) { + OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() + ->command_queue().finish(); } +#endif } public: @@ -442,17 +499,17 @@ class OpsTestNet { Workspace ws_; std::vector op_defs_; std::unique_ptr net_; - DeviceType device_; + DeviceType device_type_; }; class OpsTestBase : public ::testing::Test { protected: virtual void SetUp() { - // OpenCLRuntime::CreateGlobal(); + SetOpenMPThreadsAndAffinityPolicy(-1, + CPUAffinityPolicy::AFFINITY_BIG_ONLY); } virtual void TearDown() { - // OpenCLRuntime::DestroyGlobal(); } }; @@ -510,17 +567,6 @@ std::vector VectorStaticCast(const std::vector &&src) { return std::move(dest); } -template -std::unique_ptr CreateTensor(const std::vector &shape, - const std::vector &data) { - std::unique_ptr res( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum::v())); - res->Resize(shape); - T *input_data = res->mutable_data(); - memcpy(input_data, data.data(), data.size() * sizeof(T)); - return res; -} - inline bool IsSameSize(const Tensor &x, const Tensor &y) { if (x.dim_size() != y.dim_size()) return false; for (int d = 0; d < x.dim_size(); ++d) { diff --git a/mace/ops/pad.h b/mace/ops/pad.h index 9867710917fd64983fbb8c006bda092baa0b04b0..6a7ce1027946497cb287618a9320b33887aafcdd 100644 --- a/mace/ops/pad.h +++ b/mace/ops/pad.h @@ -26,9 +26,10 @@ namespace ops { template class PadOp : public Operator { public: - PadOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("paddings"), + PadOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("paddings"), OperatorBase::GetOptionalArg("constant_value", 0.0)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 
2f4a97214a04ab1df8f78005cf2da6f82c819643..3a68248eb5dfc157b3c3111e910b2928fb9b6369 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -63,7 +63,7 @@ void Simple() { auto output = net.GetTensor("Output"); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 5, 6, 1}, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, @@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) { auto output = net.GetTensor("Output"); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 4}, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, @@ -134,8 +134,8 @@ void Complex(const std::vector &input_shape, net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -155,9 +155,9 @@ void Complex(const std::vector &input_shape, auto output = net.GetTensor("OpenCLOutput"); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *output, 1e-2, 1e-2); + ExpectTensorNear(*expected, *output, 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *output, 1e-5); + ExpectTensorNear(*expected, *output, 1e-5); } } } // namespace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index fac4e1dd53b62c811aa40f2b7dfe7b96c1610213..3d1753b399489766da17a2245ef2dc4f92f8683d 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,13 +27,14 @@ namespace ops { template class PoolingOp : public ConvPool2dOpBase { public: - PoolingOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), + PoolingOp(const OperatorDef &op_def, OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), kernels_(OperatorBase::GetRepeatedArgs("kernels")), pooling_type_( static_cast(OperatorBase::GetOptionalArg( "pooling_type", static_cast(AVG)))), - functor_(pooling_type_, + functor_(context, + pooling_type_, kernels_.data(), this->strides_.data(), this->padding_type_, diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 72a4fdeef86077ff8633a98a14bca24642cfed0e..2f02d729ed45aa6a160af5d42d09bcc915650481 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + net.CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) { NHWC); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { NHWC); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { NHWC); // Check - auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); + auto expected = net.CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() { } // Check - auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); + auto expected = 
net.CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector &input_shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector &input_shape, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } } // namespace @@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) { NHWC); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() { kernels::BufferType::IN_OUT_CHANNEL); // Check - auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); + auto expected = net.CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector &shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector &shape, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, 1e-3); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } } // namespace @@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) { // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + net.CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) { net.RunOp(); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) { net.RunOp(); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h index 1afabb8fe36800a4e09af30d0e14dd9586256376..d879e240ca200d5fbd09212a7e0ecde68314c47e 100644 --- a/mace/ops/proposal.h +++ b/mace/ops/proposal.h @@ -24,9 +24,10 @@ namespace ops { template class ProposalOp : public Operator { public: - ProposalOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("min_size", 16), + 
ProposalOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("min_size", 16), OperatorBase::GetOptionalArg("nms_thresh", 0.7), OperatorBase::GetOptionalArg("pre_nms_top_n", 6000), OperatorBase::GetOptionalArg("post_nms_top_n", 300), diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc index c5b71ad24d892fefb09eb3999b1d92d27113ff62..e8b2ae5aad79dbab8f08e89006a7e38ff40360d0 100644 --- a/mace/ops/proposal_test.cc +++ b/mace/ops/proposal_test.cc @@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) { // Run net.RunOp(); - auto expected_tensor = CreateTensor({1, 1, 1, 5}, {0, 0, 0, 255, 255}); + auto expected_tensor = net.CreateTensor({1, 1, 1, 5}, + {0, 0, 0, 255, 255}); ExpectTensorNear(*expected_tensor, *net.GetTensor("Output"), 1e-5); } diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h index eb78489bb53a70c9321bcf37bd1abb6c8543b5ac..2e7a77c2c624e5cc551898bc0b6d971eba580b1a 100644 --- a/mace/ops/quantize.h +++ b/mace/ops/quantize.h @@ -24,8 +24,9 @@ namespace ops { template class QuantizeOp : public Operator { public: - QuantizeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context), non_zero_( static_cast(OperatorBase::GetOptionalArg("non_zero", 0))) {} @@ -50,8 +51,8 @@ class QuantizeOp : public Operator { template class DequantizeOp : public Operator { public: - DequantizeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/reduce_mean.h b/mace/ops/reduce_mean.h index 7cdaff86fbd6714417678c27d36357a3d9cde4e3..0ef9c10274abbb28b6fb86bba2591e28ab0e38d2 100644 --- a/mace/ops/reduce_mean.h +++ b/mace/ops/reduce_mean.h @@ -27,9 +27,10 @@ namespace ops { template class ReduceMeanOp : public Operator { public: - ReduceMeanOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("axis"), + ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("axis"), OperatorBase::GetOptionalArg("keepdims", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index 4f5a029e836aee7671256c790850f7c6044e11a1..2b1875ded8fb030234f818b0061067099d8ed467 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -57,7 +57,7 @@ void Simple(const std::vector &input_shape, ImageToBuffer(&net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); } diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h index c47e6cb1791e2fbd3e1fa1aa0506d9189f6dd0f1..86476de06bb5cb65e55bc623218fb7f97f1e3819 100644 --- a/mace/ops/reshape.h +++ b/mace/ops/reshape.h @@ -26,8 +26,8 @@ namespace ops { template class ReshapeOp : public Operator { public: - ReshapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + ReshapeOp(const OperatorDef 
&op_def, OpKernelContext *context) + : Operator(op_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/resize_bicubic.h b/mace/ops/resize_bicubic.h index a83f3a310afc02ca3abd474b4481e16470f28953..23b4c116b660ae814e9c8085a7cbf90712861c02 100644 --- a/mace/ops/resize_bicubic.h +++ b/mace/ops/resize_bicubic.h @@ -24,9 +24,10 @@ namespace ops { template class ResizeBicubicOp : public Operator { public: - ResizeBicubicOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("size", {-1, -1}), + ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("size", {-1, -1}), OperatorBase::GetOptionalArg("align_corners", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 7c7bd8bc263dd579fc3576a278550a894f97a7d3..97da04804395fdbe13e1fef70ca619ce4f06c771 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } @@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { NHWC); // Check - auto expected = CreateTensor({1, 2, 3, 3}, + auto expected = net.CreateTensor({1, 2, 3, 3}, {0., 1., 2., 4.110297, 5.110297, 6.110297, 8.223037, 9.223036, 10.223037, 24., 25., 26., 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); @@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h index fb3898599ef706c6dc160158a074cb2ff663d986..f328a9a45e152b162ea0b7e978d078b0d5dbac29 100644 --- a/mace/ops/resize_bilinear.h +++ b/mace/ops/resize_bilinear.h @@ -24,9 +24,10 @@ namespace ops { template class ResizeBilinearOp : public Operator { public: - ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("size", {-1, -1}), + ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("size", {-1, -1}), OperatorBase::GetOptionalArg("align_corners", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 49dda888ee3ebceab58ec9f82830cef05d8d3ebe..5d284f867a88c4acedbeb9293372dba7b9e1ea9d 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { NHWC); // Check - auto 
expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -116,8 +116,8 @@ void TestRandomResizeBilinear() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", @@ -136,7 +136,7 @@ void TestRandomResizeBilinear() { kernels::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-6); } } diff --git a/mace/ops/scalar_math.h b/mace/ops/scalar_math.h index 29cb478c718f0d7eef1a8c1e18c61550ca9f2cee..356c93719894353a35459371b9f04d5f821a540a 100644 --- a/mace/ops/scalar_math.h +++ b/mace/ops/scalar_math.h @@ -26,9 +26,10 @@ namespace ops { template class ScalarMathOp : public Operator { public: - ScalarMathOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast( + ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + static_cast( OperatorBase::GetOptionalArg( "type", static_cast(kernels::EltwiseType::NONE))), OperatorBase::GetRepeatedArgs("coeff"), diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc index 32b9db0001f4c9edb5639e90683bb5ac49a3449d..0d34b80abb16cf4e7f6126f2d74e9c5ce8770fe0 100644 --- a/mace/ops/scalar_math_test.cc +++ b/mace/ops/scalar_math_test.cc @@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type, net.RunOp(D); - auto expected = CreateTensor({}, {output}); + auto expected = net.CreateTensor({}, {output}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace TEST_F(ScalarMathOpTest, SimpleCPU) { -ScalarMathTest( - kernels::EltwiseType::SUM, 1, 2, 3, 3); -ScalarMathTest( - kernels::EltwiseType::SUB, 1, 2, 3, -1); -ScalarMathTest( - kernels::EltwiseType::PROD, 3, -2, 3, -6); -ScalarMathTest( - kernels::EltwiseType::DIV, 3, -2, 1, -1.5); -ScalarMathTest( - kernels::EltwiseType::MIN, 3, -2, 1, -2); -ScalarMathTest( - kernels::EltwiseType::MAX, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::NEG, 3, -2, 1, -3); -ScalarMathTest( - kernels::EltwiseType::ABS, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); -ScalarMathTest( - kernels::EltwiseType::POW, 3, 1, 1, 3); -ScalarMathTest( - kernels::EltwiseType::EQUAL, 3, 3, 1, 1); + ScalarMathTest( + kernels::EltwiseType::SUM, 1, 2, 3, 3); + ScalarMathTest( + kernels::EltwiseType::SUB, 1, 2, 3, -1); + ScalarMathTest( + kernels::EltwiseType::PROD, 3, -2, 3, -6); + ScalarMathTest( + kernels::EltwiseType::DIV, 3, -2, 1, -1.5); + ScalarMathTest( + kernels::EltwiseType::MIN, 3, -2, 1, -2); + ScalarMathTest( + kernels::EltwiseType::MAX, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::NEG, 3, -2, 1, -3); + ScalarMathTest( + kernels::EltwiseType::ABS, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); + ScalarMathTest( + kernels::EltwiseType::POW, 3, 1, 1, 3); + ScalarMathTest( + kernels::EltwiseType::EQUAL, 3, 3, 1, 1); } TEST_F(ScalarMathOpTest, SimpleGPU) { -ScalarMathTest( - kernels::EltwiseType::SUM, 1, 2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SUB, 1, 2, 1, -1); -ScalarMathTest( - kernels::EltwiseType::PROD, 3, -2, 1, 
-6); -ScalarMathTest( - kernels::EltwiseType::DIV, 3, -2, 1, -1.5); -ScalarMathTest( - kernels::EltwiseType::MIN, 3, -2, 1, -2); -ScalarMathTest( - kernels::EltwiseType::MAX, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::NEG, 3, -2, 1, -3); -ScalarMathTest( - kernels::EltwiseType::ABS, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); -ScalarMathTest( - kernels::EltwiseType::POW, 3, 1, 1, 3); -ScalarMathTest( - kernels::EltwiseType::EQUAL, 3, 3, 1, 1); + ScalarMathTest( + kernels::EltwiseType::SUM, 1, 2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SUB, 1, 2, 1, -1); + ScalarMathTest( + kernels::EltwiseType::PROD, 3, -2, 1, -6); + ScalarMathTest( + kernels::EltwiseType::DIV, 3, -2, 1, -1.5); + ScalarMathTest( + kernels::EltwiseType::MIN, 3, -2, 1, -2); + ScalarMathTest( + kernels::EltwiseType::MAX, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::NEG, 3, -2, 1, -3); + ScalarMathTest( + kernels::EltwiseType::ABS, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); + ScalarMathTest( + kernels::EltwiseType::POW, 3, 1, 1, 3); + ScalarMathTest( + kernels::EltwiseType::EQUAL, 3, 3, 1, 1); } } // namespace test } // namespace ops diff --git a/mace/ops/shape.h b/mace/ops/shape.h index 98f139e44877756875cd8f0d7ee6335b35ae75bc..abb9ffb3197bf53c46881e53bc01c3f4c072bae3 100644 --- a/mace/ops/shape.h +++ b/mace/ops/shape.h @@ -25,8 +25,8 @@ namespace ops { template class ShapeOp : public Operator { public: - ShapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + ShapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h index 0a6868f05e2393aca3fccfe0fd535964c079c194..047402f0c0c5bf45f25ff58405359013e6ce0fa4 100644 --- a/mace/ops/softmax.h +++ b/mace/ops/softmax.h @@ -24,8 +24,9 @@ namespace ops { template class SoftmaxOp : public Operator { public: - SoftmaxOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *logits = this->Input(LOGITS); diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 827067f4ce093b42539cc388fefb13ffa691b905..012424c5b5d3deeed00fc73beb05b02063cd3374 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -29,7 +29,7 @@ void Simple() { // Add input data net.AddInputFromArray("Input", {1, 1, 2, 4}, {1, 1, 1, 1, 1, 2, 3, 4}); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 4}, {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426}); @@ -113,8 +113,8 @@ void Complex(const std::vector &logits_shape) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -131,7 +131,7 @@ void Complex(const std::vector &logits_shape) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } // namespace diff --git 
a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h index 7ce0dd135fcd8c43844db52740e993ba8aafd6ab..170bde09b0876edb370f7873f3f9fa09e55d67ce 100644 --- a/mace/ops/space_to_batch.h +++ b/mace/ops/space_to_batch.h @@ -27,9 +27,10 @@ namespace ops { template class SpaceToBatchNDOp : public Operator { public: - SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), + SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), false) {} diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 5539bfd628a5f15c1a8511b47ebf3d8f5ff322af..8a3c35feff500ccf180b23de814ca5c89569c74b 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector &space_shape, const std::vector &padding_data, const std::vector &batch_shape, const std::vector &batch_data) { - auto space_tensor = std::unique_ptr( - new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); + OpsTestNet net; + auto space_tensor = net.CreateTensor(); space_tensor->Resize(space_shape); { Tensor::MappingGuard space_mapper(space_tensor.get()); - T *space_ptr = space_tensor->mutable_data(); + T *space_ptr = space_tensor->template mutable_data(); MACE_CHECK(static_cast(space_tensor->size()) == space_data.size()) << "Space tensor size:" << space_tensor->size() << ", space data size:" << space_data.size(); memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); } - auto batch_tensor = std::unique_ptr( - new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); + auto batch_tensor = net.CreateTensor(); batch_tensor->Resize(batch_shape); { Tensor::MappingGuard batch_mapper(batch_tensor.get()); - T *batch_ptr = batch_tensor->mutable_data(); + T *batch_ptr = batch_tensor->template mutable_data(); MACE_CHECK(static_cast(batch_tensor->size()) == batch_data.size()); memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); } diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h index 44ca7e5c5b72a69cb8bab5f1f665b71bd64ede35..75dd27ed04a4a49a85a7e6c8d760bc0a76c1928b 100644 --- a/mace/ops/space_to_depth.h +++ b/mace/ops/space_to_depth.h @@ -27,9 +27,11 @@ namespace ops { template class SpaceToDepthOp : public Operator { public: - SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("block_size", 1), false) {} + SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("block_size", 1), + false) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/split.h b/mace/ops/split.h index 710cdfb343de578c59830022b5e702e5ee99dd18..aa41aa15c6bb6a2f181d514b916859c252aeffb1 100644 --- a/mace/ops/split.h +++ b/mace/ops/split.h @@ -26,9 +26,9 @@ namespace ops { template class SplitOp : public Operator { public: - SplitOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 3)) {} + SplitOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} 
MaceStatus Run(StatsFuture *future) override { MACE_CHECK(this->OutputSize() >= 2) diff --git a/mace/ops/squeeze.h b/mace/ops/squeeze.h index 35b2aed4c2585f5bc85c427962270d9e35baf973..7febfb0e20b377c54493623910c64f18228da487 100644 --- a/mace/ops/squeeze.h +++ b/mace/ops/squeeze.h @@ -26,8 +26,8 @@ namespace ops { template class SqueezeOp : public Operator { public: - SqueezeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + SqueezeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), axis_(OperatorBase::GetRepeatedArgs("axis", {})) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/stack.h b/mace/ops/stack.h index 17210fb29259cfbdf52b91840424863c0c3c62c4..be25c0b079cf014eb171c2b4f311e038ac256892 100644 --- a/mace/ops/stack.h +++ b/mace/ops/stack.h @@ -26,9 +26,9 @@ namespace ops { template class StackOp : public Operator { public: - StackOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0)) {} + StackOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} MaceStatus Run(StatsFuture *future) override { const std::vector &inputs = this->Inputs(); diff --git a/mace/ops/strided_slice.h b/mace/ops/strided_slice.h index 57653359c2b0d4333ed8e04517c699e60b7439b3..249dc3e9d07b7b59665faedc10cb7c320f1c9aea 100644 --- a/mace/ops/strided_slice.h +++ b/mace/ops/strided_slice.h @@ -24,9 +24,10 @@ namespace ops { template class StridedSliceOp : public Operator { public: - StridedSliceOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("begin_mask", 0), + StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("begin_mask", 0), OperatorBase::GetOptionalArg("end_mask", 0), OperatorBase::GetOptionalArg("ellipsis_mask", 0), OperatorBase::GetOptionalArg("new_axis_mask", 0), diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h index 1ad73db91ede576ddde3406648d41b61fd630e4b..91aa3365a3606b3f8899e4ca07141fba7011fc7d 100644 --- a/mace/ops/transpose.h +++ b/mace/ops/transpose.h @@ -26,10 +26,10 @@ namespace mace { template class TransposeOp : public Operator { public: - TransposeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + TransposeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), dims_(OperatorBase::GetRepeatedArgs("dims")), - functor_(dims_) {} + functor_(context, dims_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/unstack.h b/mace/ops/unstack.h index 1f743bd5974e99f758b8922506f0588c81b419ff..1c3d1764972f6f8dc40e7353a2445e1e0ee6421d 100644 --- a/mace/ops/unstack.h +++ b/mace/ops/unstack.h @@ -26,9 +26,9 @@ namespace ops { template class UnstackOp : public Operator { public: - UnstackOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0)) {} + UnstackOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git 
a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 2406a3614a3acb49788c2bc2ac72338e068b0a1a..3cd5ab92b7a5aa0def56ed83bb58847042b2fc20 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch, // Transfer output ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); - Tensor expected; - expected.Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected.shape(); + + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected->shape(); // Winograd convolution // transform filter @@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch, ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-5, 1e-4); } } } // namespace @@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch, // Transfer output ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); - Tensor expected; - expected.Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected.shape(); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected->shape(); // Winograd convolution // transform filter @@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch, ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-5, 1e-4); } } } // namespace diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h index 0349de8ace51322cdc715c9bc81ee3c4ec21b2bb..548c889a2538b147eae895f24f7b844de5fc6e1c 100644 --- a/mace/ops/winograd_inverse_transform.h +++ b/mace/ops/winograd_inverse_transform.h @@ -29,9 +29,11 @@ namespace ops { template class WinogradInverseTransformOp : public Operator { public: - WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(kernels::StringToActivationType( + WinogradInverseTransformOp(const OperatorDef &op_def, + OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), OperatorBase::GetOptionalArg("max_limit", 0.0f), diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h index db874287a4dae9b09cee727516789b52e3349399..2274b6e8a8c29aa0a4d46cda6a344206055aa0fa 100644 --- a/mace/ops/winograd_transform.h +++ b/mace/ops/winograd_transform.h @@ -26,9 +26,10 @@ namespace ops { template class WinogradTransformOp : public Operator { public: - WinogradTransformOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast(OperatorBase::GetOptionalArg( + WinogradTransformOp(const OperatorDef 
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index db874287a4dae9b09cee727516789b52e3349399..2274b6e8a8c29aa0a4d46cda6a344206055aa0fa 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -26,9 +26,10 @@ namespace ops {
 template <DeviceType D, typename T>
 class WinogradTransformOp : public Operator<D, T> {
  public:
-  WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws),
-        functor_(static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
+  WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context)
+      : Operator<D, T>(op_def, context),
+        functor_(context,
+                 static_cast<Padding>(OperatorBase::GetOptionalArg<int>(
                      "padding", static_cast<int>(VALID))),
                  OperatorBase::GetRepeatedArgs<int>("padding_values"),
                  OperatorBase::GetOptionalArg<int>(
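Both operator changes above follow the pattern used throughout this refactor: the Workspace pointer (and the global singletons reached through it) is replaced by an OpKernelContext that the engine passes into every operator, and that each operator forwards into its kernel functor. A minimal self-contained sketch of that wiring; the types below are simplified stand-ins for illustration, not MACE's real OpKernelContext, Allocator, or functor interfaces:

    #include <cstddef>
    #include <iostream>
    #include <new>

    // Simplified stand-in: in MACE this would be the device allocator.
    struct Allocator {
      virtual void *New(std::size_t bytes) { return ::operator new(bytes); }
      virtual void Delete(void *ptr) { ::operator delete(ptr); }
      virtual ~Allocator() {}
    };

    // The context carries per-engine state (device, allocator, tuner, ...)
    // that operators previously fetched from global registries.
    struct OpKernelContext {
      explicit OpKernelContext(Allocator *allocator) : allocator(allocator) {}
      Allocator *allocator;
    };

    // Kernel functors now receive the context in their constructor ...
    struct TransformFunctor {
      explicit TransformFunctor(OpKernelContext *context) : context(context) {}
      void operator()() {
        void *scratch = context->allocator->New(64);  // no global lookup
        std::cout << "ran with injected allocator\n";
        context->allocator->Delete(scratch);
      }
      OpKernelContext *context;
    };

    // ... and operators forward the context instead of a Workspace pointer.
    struct TransformOp {
      explicit TransformOp(OpKernelContext *context) : functor(context) {}
      void Run() { functor(); }
      TransformFunctor functor;
    };

    int main() {
      Allocator cpu_allocator;           // e.g. one per engine/device
      OpKernelContext context(&cpu_allocator);
      TransformOp op(&context);          // context flows op -> functor
      op.Run();
    }

With this shape, two engines in one process can use different devices and allocators without touching shared mutable state, which is what the multi-threaded API test further down exercises.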
diff --git a/mace/public/BUILD b/mace/public/BUILD
index 3669d59518f3b89484626d1f023195f58395b924..b434312bcfdd4ec65a78bfc879a2dfcb41cc129c 100644
--- a/mace/public/BUILD
+++ b/mace/public/BUILD
@@ -11,7 +11,6 @@ cc_library(
     name = "public",
     hdrs = [
         "mace.h",
-        "mace_runtime.h",
     ],
     copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
 )
diff --git a/mace/public/mace.h b/mace/public/mace.h
index f6116348f7f5874021271fd04feb680c615df4c7..0b743423b3557ed2c5687334a96c0285e4d125d7 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -24,12 +24,36 @@
 #include <string>
 #include <vector>
 
+#ifndef MACE_API
+#define MACE_API __attribute__((visibility("default")))
+#endif
+
 namespace mace {
 
 class NetDef;
 
 enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };
 
+enum GPUPerfHint {
+  PERF_DEFAULT = 0,
+  PERF_LOW = 1,
+  PERF_NORMAL = 2,
+  PERF_HIGH = 3
+};
+
+enum GPUPriorityHint {
+  PRIORITY_DEFAULT = 0,
+  PRIORITY_LOW = 1,
+  PRIORITY_NORMAL = 2,
+  PRIORITY_HIGH = 3
+};
+
+enum CPUAffinityPolicy {
+  AFFINITY_NONE = 0,
+  AFFINITY_BIG_ONLY = 1,
+  AFFINITY_LITTLE_ONLY = 2,
+};
+
 struct CallStats {
   int64_t start_micros;
   int64_t end_micros;
@@ -73,14 +97,167 @@ enum MaceStatus {
     }                   \
   }
 
+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function detects the max frequencies of all CPU cores, treats the
+/// cores with the largest max frequency as big cores, and all the remaining
+/// cores as little. If all CPU cores' max frequencies are equal, big_core_ids
+/// and little_core_ids will both be filled with all CPU core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return MACE_SUCCESS if successful, or an error if it can't reliably
+/// detect the frequency of big-LITTLE cores (e.g. MTK).
+
+MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<size_t> *big_core_ids,
+                                        std::vector<size_t> *little_core_ids);
+
+/// \brief GPUContext contains the state used by the GPU device.
+///
+/// The lifetime of a GPUContext object must cover that of every MaceEngine
+/// using it. Use one GPUContext for all MaceEngines: engines running on GPU
+/// have some data in common, so sharing one GPUContext avoids duplication
+/// and speeds up the initialization procedure.
+class GPUContext;
+
+/// \brief GPUContext builder.
+///
+/// Use GPUContextBuilder to create a GPUContext.
+/// Not thread-safe.
+class MACE_API GPUContextBuilder {
+ public:
+  GPUContextBuilder();
+  ~GPUContextBuilder();
+  GPUContextBuilder(const GPUContextBuilder &) = delete;
+  GPUContextBuilder(const GPUContextBuilder &&) = delete;
+  GPUContextBuilder &operator=(const GPUContextBuilder &) = delete;
+  GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete;
+
+  /// \brief Set the internal storage path used to store internal data.
+  ///
+  /// Currently the path is used to store the built OpenCL binaries to file,
+  /// which speeds up GPU initialization and the first run.
+  /// If this API is not called, GPU initialization may be slow.
+  ///
+  /// \param path Make sure your program has read/write permission for the path
+  /// \return
+  GPUContextBuilder &SetStoragePath(const std::string &path);
+  /// \brief Set paths of the generated OpenCL compiled kernel binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
+  ///
+  /// If you run on a GPU of a specific SoC, using an OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
+  /// The OpenCL binary corresponds to the OpenCL driver version, so the
+  /// binary should be regenerated when the OpenCL driver changes.
+  ///
+  /// \param paths MACE will use the first file found among all paths
+  /// \return
+  GPUContextBuilder &SetOpenCLBinaryPaths(
+      const std::vector<std::string> &paths);
+  /// \brief Set the path of the generated OpenCL parameter file
+  ///
+  /// If you run on a GPU of a specific SoC, the parameters are the local
+  /// work group sizes tuned for that SoC, which may be faster than the
+  /// general parameters.
+  ///
+  /// \param path Make sure your program has read/write permission for the path
+  /// \return
+  GPUContextBuilder &SetOpenCLParameterPath(const std::string &path);
+
+  std::shared_ptr<GPUContext> Finalize();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+class MACE_API MaceEngineConfig {
+ public:
+  explicit MaceEngineConfig(const DeviceType device_type);
+  ~MaceEngineConfig();
+  MaceEngineConfig(const MaceEngineConfig &) = delete;
+  MaceEngineConfig(const MaceEngineConfig &&) = delete;
+  MaceEngineConfig &operator=(const MaceEngineConfig &) = delete;
+  MaceEngineConfig &operator=(const MaceEngineConfig &&) = delete;
+
+  /// \brief Set GPUContext
+  ///
+  /// Use one GPUContext for multiple models running on GPU.
+  /// \param context a context created with GPUContextBuilder
+  /// \return MACE_SUCCESS for success, other for failure.
+  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
+
+  /// \brief Set GPU hints, currently only supports Adreno GPU.
+  ///
+  /// Caution: this function may hurt performance
+  /// if improper parameters are provided.
+  ///
+  /// \param perf_hint performance hint
+  /// \param priority_hint priority hint
+  /// \return MACE_SUCCESS for success, other for failure.
+  MaceStatus SetGPUHints(GPUPerfHint perf_hint,
+                         GPUPriorityHint priority_hint);
+
+  /// \brief Set CPU threads number and affinity policy.
+  ///
+  /// Caution: this function may hurt performance if improper
+  /// parameters are provided. When num_threads_hint is zero or negative,
+  /// the function will set the threads number equal to the number of
+  /// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
+  /// (AFFINITY_NONE) cores according to the policy. The threads number will
+  /// also be truncated to the corresponding cores number when num_threads_hint
+  /// is larger than it.
+  /// The OpenMP threads will be bound (via sched_setaffinity) to big cores
+  /// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
+  ///
+  /// \param num_threads_hint it is only a hint.
+  /// \param policy one of CPUAffinityPolicy
+  /// \param use_gemmlowp use gemmlowp for quantized inference
+  /// \return MACE_SUCCESS for success, or an error if it can't reliably
+  /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's
+  /// suggested to use AFFINITY_NONE to use all cores.
+  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
+                                CPUAffinityPolicy policy,
+                                bool use_gemmlowp = false);
+
+  /// \brief Set OpenMP threads number and processor affinity.
+  ///
+  /// Caution: this function may hurt performance
+  /// if improper parameters are provided.
+  /// This function may not work well on some chips (e.g. MTK). Setting thread
+  /// affinity to offline cores may run very slowly or behave unexpectedly.
+  /// In such cases, please use SetCPUThreadPolicy with the default policy
+  /// instead.
+  ///
+  /// \param num_threads
+  /// \param cpu_ids
+  /// \return MACE_SUCCESS for success, other for failure.
+  MaceStatus SetOpenMPThreadAffinity(
+      int num_threads,
+      const std::vector<size_t> &cpu_ids);
+
+  DeviceType device_type() const;
+
+  int num_threads() const;
+
+  std::shared_ptr<GPUContext> gpu_context() const;
+
+  GPUPriorityHint gpu_priority_hint() const;
+
+  GPUPerfHint gpu_perf_hint() const;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 // MACE input/output tensor
-class __attribute__((visibility("default"))) MaceTensor {
+class MACE_API MaceTensor {
  public:
   // shape - the shape of the tensor, with size n
   // data - the buffer of the tensor, must not be null with size equals
   //        shape[0] * shape[1] * ... * shape[n-1]
-  explicit MaceTensor(const std::vector<int64_t> &shape,
-                      std::shared_ptr<float> data);
+  MaceTensor(const std::vector<int64_t> &shape,
+             std::shared_ptr<float> data);
   MaceTensor();
   MaceTensor(const MaceTensor &other);
   MaceTensor(const MaceTensor &&other);
@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor {
   std::unique_ptr<Impl> impl_;
 };
 
-class __attribute__((visibility("default"))) MaceEngine {
+class MACE_API MaceEngine {
  public:
-  explicit MaceEngine(DeviceType device_type);
+  explicit MaceEngine(const MaceEngineConfig &config);
   ~MaceEngine();
 
   MaceStatus Init(const NetDef *net_def,
@@ -135,18 +312,16 @@
 /// \param model_data_file[in]: the path of model data file
 /// \param input_nodes[in]: the array of input nodes' name
 /// \param output_nodes[in]: the array of output nodes' name
-/// \param device_type[in]: one of [CPU, GPU, HEXAGON],
-///        based on the runtime type of your model deployment file.
+/// \param config[in]: configurations for MaceEngine.
 /// \param engine[out]: output MaceEngine object
 /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
 ///         MACE_OUT_OF_RESOURCES for resources is out of range.
-__attribute__((visibility("default")))
-MaceStatus CreateMaceEngineFromProto(
+MACE_API MaceStatus CreateMaceEngineFromProto(
     const std::vector<unsigned char> &model_pb,
     const std::string &model_data_file,
    const std::vector<std::string> &input_nodes,
     const std::vector<std::string> &output_nodes,
-    const DeviceType device_type,
+    const MaceEngineConfig &config,
     std::shared_ptr<MaceEngine> *engine);
 
 }  // namespace mace
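Taken together, the additions above replace the free functions of mace/public/mace_runtime.h (deleted next) with two objects: a GPUContext that can be shared by every engine in the process, and a per-engine MaceEngineConfig. The following is a sketch of client code written against this header; the storage path, OpenCL binary/parameter paths, model data file, node names, and thread count are all placeholders, and error checking is trimmed:

    #include <memory>
    #include <string>
    #include <vector>

    #include "mace/public/mace.h"

    mace::MaceStatus BuildEngine(const std::vector<unsigned char> &model_pb,
                                 std::shared_ptr<mace::MaceEngine> *engine) {
      // One GPUContext can be shared by every engine in the process.
      std::shared_ptr<mace::GPUContext> gpu_context =
          mace::GPUContextBuilder()
              .SetStoragePath("/path/with/rw/permission")       // placeholder
              .SetOpenCLBinaryPaths({"/path/to/opencl.bin"})    // placeholder
              .SetOpenCLParameterPath("/path/to/opencl.param")  // placeholder
              .Finalize();

      // Per-engine configuration replaces the old global setters.
      mace::MaceEngineConfig config(mace::DeviceType::GPU);
      config.SetGPUContext(gpu_context);
      config.SetGPUHints(mace::PERF_HIGH, mace::PRIORITY_NORMAL);
      config.SetCPUThreadPolicy(4, mace::AFFINITY_BIG_ONLY);

      return mace::CreateMaceEngineFromProto(
          model_pb, "/path/to/model.data",                      // placeholder
          {"input"}, {"output"}, config, engine);
    }

mace_run.cc and quantize_stat.cc further down perform exactly this sequence with gflags-supplied values.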
diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h
deleted file mode 100644
index 4cd60d2b60633c7df5c30de45ecd26df64581cc3..0000000000000000000000000000000000000000
--- a/mace/public/mace_runtime.h
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright 2018 Xiaomi, Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file defines runtime tuning APIs.
-// These APIs are not stable.
-
-#ifndef MACE_PUBLIC_MACE_RUNTIME_H_
-#define MACE_PUBLIC_MACE_RUNTIME_H_
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "mace/public/mace.h"
-
-namespace mace {
-
-enum GPUPerfHint {
-  PERF_DEFAULT = 0,
-  PERF_LOW = 1,
-  PERF_NORMAL = 2,
-  PERF_HIGH = 3
-};
-
-enum GPUPriorityHint {
-  PRIORITY_DEFAULT = 0,
-  PRIORITY_LOW = 1,
-  PRIORITY_NORMAL = 2,
-  PRIORITY_HIGH = 3
-};
-
-enum CPUAffinityPolicy {
-  AFFINITY_NONE = 0,
-  AFFINITY_BIG_ONLY = 1,
-  AFFINITY_LITTLE_ONLY = 2,
-};
-
-class KVStorage {
- public:
-  // return: 0 for success, -1 for error
-  virtual int Load() = 0;
-  virtual void Clear() = 0;
-  virtual bool Insert(const std::string &key,
-                      const std::vector<unsigned char> &value) = 0;
-  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
-  // return: 0 for success, -1 for error
-  virtual int Flush() = 0;
-  virtual ~KVStorage() {}
-};
-
-class KVStorageFactory {
- public:
-  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
-};
-
-class __attribute__((visibility("default"))) FileStorageFactory
-    : public KVStorageFactory {
- public:
-  // You have to make sure your APP have read and write permission of the path.
-  explicit FileStorageFactory(const std::string &path);
-
-  ~FileStorageFactory();
-
-  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
-
- private:
-  class Impl;
-  std::unique_ptr<Impl> impl_;
-};
-
-/// \brief Set internal storage factory to store internal data. (Call once)
-///
-/// Now the path is used to store the built OpenCL binaries to file,
-/// which could speed up the GPU initialization and first run.
-/// If do not call this API, the initialization maybe slow for GPU.
-///
-/// \param path Make sure your program have Read/Write permission of the path
-/// \return
-__attribute__((visibility("default")))
-void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
-
-/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so)  // NOLINT(whitespace/line_length)
-///
-/// Just call once. (Not thread-safe)
-/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization.  // NOLINT(whitespace/line_length)
-/// OpenCL binary is corresponding to the OpenCL Driver version,
-/// you should update the binary when OpenCL Driver changed.
-///
-/// \param paths MACE will use first file found in all paths
-/// \return
-__attribute__((visibility("default")))
-void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
-
-/// \brief Set the path of Generated OpenCL parameter file
-///
-/// Just call once. (Not thread-safe)
-/// If you use gpu for specific soc, The parameters is the local work group
-/// size tuned for specific SOC, which may be faster than the
-/// general parameters.
-///
-/// \param path Make sure your program have Read/Write permission of the path
-/// \return
-__attribute__((visibility("default")))
-void SetOpenCLParameterPath(const std::string &path);
-
-/// \brief Set GPU hints, currently only supports Adreno GPU.
-///
-/// Caution: this function may hurt performance
-/// if improper parameters provided.
-///
-/// \param perf_hint performance hint
-/// \param priority_hint priority hint
-/// \return
-__attribute__((visibility("default")))
-void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
-
-/// \brief Set OpenMP threads number and affinity policy.
-///
-/// Caution: this function may hurt performance if improper parameters provided.
-/// When num_threads_hint is zero or negative,
-/// the function will set the threads number equaling to the number of
-/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-/// (AFFINITY_NONE) cores according to the policy. The threads number will
-/// also be truncated to the corresponding cores number when num_threads_hint
-/// is larger than it.
-/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
-/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
-/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for
-/// quantized inference.
-///
-/// \param num_threads_hint it is only a hint.
-/// \param policy one of CPUAffinityPolicy
-/// \param use_gemmlowp use gemmlowp for quantized inference
-/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
-/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
-/// AFFINITY_NONE to use all cores.
-__attribute__((visibility("default")))
-MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
-                                 CPUAffinityPolicy policy,
-                                 bool use_gemmlowp = false);
-
-/// \brief Set OpenMP threads number and processor affinity.
-///
-/// Caution: this function may hurt performance
-/// if improper parameters provided.
-/// This function may not work well on some chips (e.g. MTK). Setting thread
-/// affinity to offline cores may run very slow or unexpectedly.
-/// In such cases, please use SetOpenMPThreadPolicy with default policy
-/// instead.
-///
-/// \param num_threads
-/// \param cpu_ids
-/// \return
-__attribute__((visibility("default")))
-MaceStatus SetOpenMPThreadAffinity(int num_threads,
-                                   const std::vector<size_t> &cpu_ids);
-
-/// \brief Get ARM big.LITTLE configuration.
-///
-/// This function will detect the max frequencies of all CPU cores, and assume
-/// the cores with largest max frequencies as big cores, and all the remaining
-/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-/// little_core_ids will both be filled with all cpu core ids.
-///
-/// \param [out] big_core_ids
-/// \param [out] little_core_ids
-/// \return If successful, it returns MACE_SUCCESS and error if it can't
-/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
-__attribute__((visibility("default")))
-MaceStatus GetBigLittleCoreIDs(std::vector<size_t> *big_core_ids,
-                               std::vector<size_t> *little_core_ids);
-}  // namespace mace
-
-#endif  // MACE_PUBLIC_MACE_RUNTIME_H_
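For code still calling the deleted APIs, the migration is mechanical: each global setter has a scoped counterpart introduced in mace.h above. A quick reference in comment form (not exhaustive; names are exactly those declared in the two headers):

    // Old global API (mace_runtime.h)           New scoped API (mace.h)
    // SetKVStorageFactory(file_factory)      -> GPUContextBuilder().SetStoragePath(path)
    // SetOpenCLBinaryPaths(paths)            -> GPUContextBuilder().SetOpenCLBinaryPaths(paths)
    // SetOpenCLParameterPath(path)           -> GPUContextBuilder().SetOpenCLParameterPath(path)
    // SetGPUHints(perf, priority)            -> MaceEngineConfig::SetGPUHints(perf, priority)
    // SetOpenMPThreadPolicy(n, policy, gemm) -> MaceEngineConfig::SetCPUThreadPolicy(n, policy, gemm)
    // SetOpenMPThreadAffinity(n, cpu_ids)    -> MaceEngineConfig::SetOpenMPThreadAffinity(n, cpu_ids)
    // GetBigLittleCoreIDs(...)               -> unchanged, now declared in mace.h
    // MaceEngine(device_type)                -> MaceEngine(config)

The rest of this patch applies this mapping to the generated engine factory, the tests, and the command-line tools.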
diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2
index 472879365035bfe8a9ac945766dd559e94d72bf4..2bdda1439f039be6cfd88337a269f5cc83d23fa3 100644
--- a/mace/python/tools/mace_engine_factory.h.jinja2
+++ b/mace/python/tools/mace_engine_factory.h.jinja2
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 
@@ -57,8 +56,7 @@ std::map<std::string, int> model_name_map {
 /// if model_data_format is code, just pass empty string("")
 /// \param input_nodes[in]: the array of input nodes' name
 /// \param output_nodes[in]: the array of output nodes' name
-/// \param device_type[in]: one of [CPU, GPU, HEXAGON],
-///        based on the runtime type of your model deployment file.
+/// \param config[in]: configurations for MaceEngine.
 /// \param engine[out]: output MaceEngine object
 /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
 ///         MACE_OUT_OF_RESOURCES for resources is out of range.
@@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode(
     const std::string &model_data_file,
     const std::vector<std::string> &input_nodes,
     const std::vector<std::string> &output_nodes,
-    const DeviceType device_type,
+    const MaceEngineConfig &config,
     std::shared_ptr<MaceEngine> *engine) {
   // load model
   if (engine == nullptr) {
@@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode(
 {% for i in range(model_tags |length) %}
     case {{ i }}:
       net_def = mace::{{model_tags[i]}}::CreateNet();
-      engine->reset(new mace::MaceEngine(device_type));
+      engine->reset(new mace::MaceEngine(config));
 {% if embed_model_data %}
       model_data = mace::{{model_tags[i]}}::LoadModelData();
       status = (*engine)->Init(net_def.get(), input_nodes, output_nodes,
diff --git a/mace/test/BUILD b/mace/test/BUILD
index 09c9e030f0376ed3d6530e3b8fb155384e3c648e..04253cda9a117cd6b7905837e8e4a09ffdd1ca21 100644
--- a/mace/test/BUILD
+++ b/mace/test/BUILD
@@ -1,6 +1,3 @@
-# Description:
-#   Mace operators.
-#
 package(
     default_visibility = ["//visibility:public"],
 )
diff --git a/mace/test/mace_api_exception_test.cc b/mace/test/mace_api_exception_test.cc
index 1eaad03726165987ce00c6df70d0b23f438a2231..7507ffc8319823554cce4d1273e023c7c87988cb 100644
--- a/mace/test/mace_api_exception_test.cc
+++ b/mace/test/mace_api_exception_test.cc
@@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
   input_names.push_back(MakeString("input", 0));
   output_names.push_back(MakeString("output", 0));
 
-  const DeviceType device = DeviceType::GPU;
+  MaceEngineConfig config(DeviceType::GPU);
+  config.SetGPUContext(
+      ops::test::OpTestContext::Get()->gpu_context());
 
   std::shared_ptr<NetDef> net_def(new NetDef());
   for (size_t i = 0; i < input_names.size(); ++i) {
@@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) {
     info->set_name(input_names[i]);
   }
 
-  MaceEngine engine(device);
+  MaceEngine engine(config);
   ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr),
                "");
 }
diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc
index e2a09fec8d3991fd8dad65b8427ae61ea35b8c3a..6d554bbe3dbfbd88f338e2602c77ec6f86a2317d 100644
--- a/mace/test/mace_api_mt_test.cc
+++ b/mace/test/mace_api_mt_test.cc
@@ -18,7 +18,6 @@
 #include "mace/core/operator.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 namespace test {
@@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def,
 
   for (auto output : outputs) {
     std::unique_ptr<Tensor> tmp_tensor(
-        new Tensor(GetDeviceAllocator(DeviceType::CPU),
+        new Tensor(GetCPUAllocator(),
                    DataTypeToEnum<T>::v()));
     auto output_shape = output.second.shape();
     const int64_t data_size = std::accumulate(output_shape.begin(),
@@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) {
     OutputInfo *info = net_def->add_output_info();
     info->set_name(output_names[i]);
   }
+  MaceEngineConfig config(DeviceType::GPU);
 
-  const std::string file_path ="/data/local/tmp/mace";
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(file_path));
-  mace::SetKVStorageFactory(storage_factory);
-
-  MaceEngine engine(device);
+  MaceEngine engine(config);
   MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
       reinterpret_cast<unsigned char *>(data.data()));
   EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
@@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) {
   const int thread_num = 10;
   std::vector<std::thread> threads;
   for (int i = 0; i < thread_num; ++i) {
-    threads.push_back(std::thread(MaceRunFunc, i));
+    threads.push_back(std::thread(MaceRunFunc, 1));
  }
   for (auto &t : threads) {
     t.join();
   }
diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc
index 6b1f353eb8f7a3d77e59b84f23fcf3141bfef148..83d3b33dfb1894a486197af41c7344608bff6e9a 100644
--- a/mace/test/mace_api_test.cc
+++ b/mace/test/mace_api_test.cc
@@ -18,7 +18,7 @@
 #include "mace/core/operator.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/public/mace_runtime.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 namespace test {
@@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def,
   }
   net.RunNet(net_def, D);
 
+  std::unique_ptr<Allocator> allocator(new CPUAllocator);
   for (auto output : outputs) {
     std::unique_ptr<Tensor> tmp_tensor(
-        new Tensor(GetDeviceAllocator(DeviceType::CPU),
+        new Tensor(allocator.get(),
                    DataTypeToEnum<T>::v()));
     auto output_shape = output.second.shape();
     const int64_t data_size = std::accumulate(output_shape.begin(),
@@ -333,7 +334,9 @@ void MaceRun(const int in_out_size,
     info->set_name(output_names[i]);
   }
 
-  MaceEngine engine(device);
+  MaceEngineConfig config(DeviceType::GPU);
+
+  MaceEngine engine(config);
   MaceStatus status = engine.Init(net_def.get(), input_names, output_names,
       reinterpret_cast<unsigned char *>(data.data()));
   EXPECT_EQ(status, MaceStatus::MACE_SUCCESS);
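The CheckOutputs hunk above shows the test-side view of the allocator refactor: the test constructs a CPUAllocator itself and hands it to each Tensor instead of fetching a process-wide allocator with GetDeviceAllocator(DeviceType::CPU). A self-contained sketch of why that ownership pattern matters; Allocator and Tensor below are simplified stand-ins, not MACE's real classes:

    #include <cstdlib>
    #include <iostream>
    #include <memory>

    // Simplified stand-in for mace::Allocator.
    class Allocator {
     public:
      virtual ~Allocator() {}
      virtual void *New(std::size_t nbytes) { return std::malloc(nbytes); }
      virtual void Delete(void *p) { std::free(p); }
    };

    // Simplified stand-in for mace::Tensor: the tensor no longer asks a
    // global registry for its allocator; whoever creates it decides where
    // its memory lives.
    class Tensor {
     public:
      Tensor(Allocator *allocator, std::size_t nbytes)
          : allocator_(allocator), data_(allocator->New(nbytes)) {}
      ~Tensor() { allocator_->Delete(data_); }
     private:
      Allocator *allocator_;
      void *data_;
    };

    int main() {
      // Each test (or engine) owns a private allocator, so no registration
      // order matters and nothing leaks across tests.
      std::unique_ptr<Allocator> allocator(new Allocator);
      Tensor tmp(allocator.get(), 1024);
      std::cout << "tensor backed by injected allocator\n";
    }

The only design constraint, visible in the diff above, is that the allocator must outlive every tensor it backs, which scoping it to the test function guarantees.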
diff --git a/mace/tools/quantization/quantize_stat.cc b/mace/tools/quantization/quantize_stat.cc
index a05f42f70621ff558d790e2f0249534f0a0271f2..936196e3065705da3788d2616e4afa79335b56d1 100644
--- a/mace/tools/quantization/quantize_stat.cc
+++ b/mace/tools/quantization/quantize_stat.cc
@@ -33,7 +33,6 @@
 
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
@@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name,
               const std::vector<std::vector<int64_t>> &input_shapes,
               const std::vector<std::string> &output_names,
               const std::vector<std::vector<int64_t>> &output_shapes) {
-  MACE_RETURN_IF_ERROR(mace::SetOpenMPThreadPolicy(
-      FLAGS_omp_num_threads, CPUAffinityPolicy::AFFINITY_NONE));
+  // config runtime
+  MaceStatus status;
+  MaceEngineConfig config(DeviceType::CPU);
+  status = config.SetCPUThreadPolicy(
+      FLAGS_omp_num_threads,
+      CPUAffinityPolicy::AFFINITY_NONE);
+  if (status != MACE_SUCCESS) {
+    LOG(WARNING) << "Set openmp or cpu affinity failed.";
+  }
 
   std::vector<unsigned char> model_pb_data;
   if (FLAGS_model_file != "") {
@@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name,
                                 FLAGS_model_data_file,
                                 input_names,
                                 output_names,
-                                DeviceType::CPU,
+                                config,
                                 &engine));
 #else
   (void) (model_name);
@@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name,
                                 FLAGS_model_data_file,
                                 input_names,
                                 output_names,
-                                DeviceType::CPU,
+                                config,
                                 &engine));
 #endif
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 0aeefb789c9b6720c5b7cd3814497cf2dbf22d23..3873e5dd3fa80efc6aa90e1d2fdd483a26d1f423 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -33,7 +33,6 @@
 
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
@@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name,
               const std::vector<std::vector<int64_t>> &output_shapes) {
   DeviceType device_type = ParseDeviceType(FLAGS_device);
 
   // config runtime
-  MaceStatus status = mace::SetOpenMPThreadPolicy(
-      FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
-      true);
+  MaceStatus status;
+  MaceEngineConfig config(device_type);
+  status = config.SetCPUThreadPolicy(
+      FLAGS_omp_num_threads,
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      true);
   if (status != MACE_SUCCESS) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
 
-  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
-  const std::string kernel_file_path =
-      std::string(kernel_path == nullptr ?
-                  "/data/local/tmp/mace_run/interior" : kernel_path);
-
-  std::shared_ptr<KVStorageFactory> storage_factory(
-      new FileStorageFactory(kernel_file_path));
-  SetKVStorageFactory(storage_factory);
-
   std::vector<unsigned char> model_pb_data;
   if (FLAGS_model_file != "") {
     if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) {
@@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #else
   (void)(model_name);
@@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #endif
   int64_t t1 = NowMicros();
@@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name,
                                    FLAGS_model_data_file,
                                    input_names,
                                    output_names,
-                                   device_type,
+                                   config,
                                    &engine);
 #else
       create_engine_status =
@@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name,
                                    FLAGS_model_data_file,
                                    input_names,
                                    output_names,
-                                   device_type,
+                                   config,
                                    &engine);
 #endif
     } while (create_engine_status != MACE_SUCCESS);
@@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #else
       create_engine_status =
@@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name,
                                  FLAGS_model_data_file,
                                  input_names,
                                  output_names,
-                                 device_type,
+                                 config,
                                  &engine);
 #endif
     } while (create_engine_status != MACE_SUCCESS);
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index e4007b6694a415e7a058b5f6f33a93a2ba485e8e..3295ddaec12e3703c9839e19e55414fca873dcaf 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -15,6 +15,8 @@
 #ifndef MACE_UTILS_TUNER_H_
 #define MACE_UTILS_TUNER_H_
 
 #include <stdlib.h>
+
+#include <cstring>
 #include <fstream>
 #include <functional>
 #include <limits>
@@ -29,18 +31,24 @@
 
 namespace mace {
 
+inline bool IsTuning() {
+  const char *tuning = getenv("MACE_TUNING");
+  return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
+}
+
 template <typename param_type>
 class Tuner {
  public:
-  static Tuner *Get() {
-    static Tuner tuner;
-    return &tuner;
+  explicit Tuner(const std::string tuned_param_file_path = ""):
+      tuned_param_file_path_(tuned_param_file_path) {
+    path_ = getenv("MACE_RUN_PARAMETER_PATH");
+    ReadRunParamters();
   }
 
-  inline bool IsTuning() {
-    const char *tuning = getenv("MACE_TUNING");
-    return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
-  }
+  ~Tuner() { WriteRunParameters(); }
+
+  Tuner(const Tuner &) = delete;
+  Tuner &operator=(const Tuner &) = delete;
 
   template <typename RetType>
   RetType TuneOrRun(
@@ -76,16 +84,6 @@ class Tuner {
   }
 
  private:
-  Tuner() {
-    path_ = getenv("MACE_RUN_PARAMETER_PATH");
-    ReadRunParamters();
-  }
-
-  ~Tuner() { WriteRunParameters(); }
-
-  Tuner(const Tuner &) = delete;
-  Tuner &operator=(const Tuner &) = delete;
-
   inline void WriteRunParameters() {
     if (path_ != nullptr) {
       VLOG(3) << "Write tuning result to " << path_;
@@ -117,9 +115,9 @@ class Tuner {
   }
 
   inline void ReadRunParamters() {
-    extern std::string kOpenCLParameterPath;
-    if (!kOpenCLParameterPath.empty()) {
-      std::ifstream ifs(kOpenCLParameterPath, std::ios::binary | std::ios::in);
+    if (!tuned_param_file_path_.empty()) {
+      std::ifstream ifs(tuned_param_file_path_,
+                        std::ios::binary | std::ios::in);
       if (ifs.is_open()) {
         int64_t num_params = 0;
         ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params));
@@ -144,7 +142,7 @@ class Tuner {
         LOG(WARNING) << "Read OpenCL tuned parameters file failed.";
       }
     } else {
-      LOG(INFO) << "There is no tuned parameters.";
+      VLOG(1) << "There are no tuned parameters.";
     }
   }
 
@@ -207,6 +205,7 @@ class Tuner {
   }
 
  private:
+  std::string tuned_param_file_path_;
   const char *path_;
   std::unordered_map<std::string, std::vector<param_type>> param_table_;
 };
diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc
index bd590ac90f764849f6cc91c23a829b575a5c9b68..bff02b0bd4179f25b7bb732cfd61cee6159eba79 100644
--- a/mace/utils/tuner_test.cc
+++ b/mace/utils/tuner_test.cc
@@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) {
     }
   };
 
+  Tuner<unsigned int> tuner;
   WallClockTimer timer;
   std::vector<unsigned int> default_params(1, 1);
-  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  int res = tuner.TuneOrRun<unsigned int>(
       "SimpleRun", default_params, nullptr, TunerFunc, &timer);
 
   EXPECT_EQ(expect, res);
 
   default_params[0] = 2;
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  res = tuner.TuneOrRun<unsigned int>(
       "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect + 1, res);
 }
@@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) {
     return {{1}, {2}, {3}, {4}};
   };
   // tune
+  Tuner<unsigned int> tuner;
   WallClockTimer timer;
-  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  int res = tuner.TuneOrRun<unsigned int>(
       "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
   // run
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+  res = tuner.template TuneOrRun<unsigned int>(
       "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
 }
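The Tuner change completes the de-globalization: the singleton Get() accessor is gone, the tuned-parameter file path is injected through the constructor (replacing the extern kOpenCLParameterPath global), destruction flushes results via WriteRunParameters(), and tuning mode is now the free IsTuning() check on the MACE_TUNING environment variable. A usage sketch modeled on tuner_test.cc above; the functor signature (params, timer, tuning_result) -> result and the timer header path are assumptions inferred from the test, and the file path is a placeholder:

    #include <vector>

    #include "mace/utils/timer.h"
    #include "mace/utils/tuner.h"

    void TunerUsageSketch() {
      // Each engine owns a private Tuner; the tuned-parameter file is
      // injected rather than read from a global.
      mace::Tuner<unsigned int> tuner("/path/to/tuned_params.bin");  // placeholder
      mace::WallClockTimer timer;
      std::vector<unsigned int> default_params(1, 1);

      auto func = [](const std::vector<unsigned int> &params,
                     mace::Timer *t,
                     std::vector<unsigned int> *tuning_result) -> unsigned int {
        (void)t;
        *tuning_result = params;  // pretend these params are optimal
        return params.front();    // pretend this is the measured result
      };

      // With a nullptr param_generator the call just runs with the tuned
      // (or default) parameters; export MACE_TUNING=1 and pass a generator
      // to actually search candidates.
      unsigned int res = tuner.TuneOrRun<unsigned int>(
          "SimpleRun", default_params, nullptr, func, &timer);
      (void)res;
    }

Because the destructor writes results to MACE_RUN_PARAMETER_PATH, scoping the Tuner to the engine also gives each engine its own parameter file lifecycle instead of a process-wide flush.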