From f23eb74b6013285e0ae5195b3462da6813e36c06 Mon Sep 17 00:00:00 2001 From: liuqi Date: Mon, 10 Sep 2018 10:34:33 +0800 Subject: [PATCH] Refactor configuration APIs and Remove some global static variables. --- mace/benchmark/benchmark_model.cc | 49 ++-- mace/core/allocator.cc | 24 +- mace/core/allocator.h | 24 +- mace/core/arg_helper.h | 1 - mace/core/buffer.h | 4 +- mace/core/device.cc | 42 +++ mace/core/device.h | 60 +++++ mace/core/device_context.cc | 73 ++++++ mace/core/device_context.h | 47 ++++ mace/core/file_storage.cc | 43 ++- mace/core/file_storage.h | 41 ++- mace/core/net.cc | 36 ++- mace/core/net.h | 12 +- mace/core/op_kernel_context.cc | 32 +++ mace/core/op_kernel_context.h | 34 +++ mace/core/operator.cc | 13 +- mace/core/operator.h | 21 +- mace/core/registry.h | 1 - mace/core/runtime/cpu/cpu_runtime.cc | 1 - mace/core/runtime/cpu/cpu_runtime.h | 11 +- mace/core/runtime/opencl/gpu_device.cc | 44 ++++ mace/core/runtime/opencl/gpu_device.h | 44 ++++ mace/core/runtime/opencl/opencl_allocator.cc | 22 +- mace/core/runtime/opencl/opencl_allocator.h | 7 +- mace/core/runtime/opencl/opencl_runtime.cc | 65 ++--- mace/core/runtime/opencl/opencl_runtime.h | 70 ++--- mace/core/tensor.h | 9 +- mace/core/testing/test_benchmark_main.cc | 9 - mace/core/workspace.cc | 48 ++-- mace/core/workspace.h | 10 +- .../java/com/xiaomi/mace/demo/AppModel.java | 15 +- .../com/xiaomi/mace/demo/CameraActivity.java | 2 +- .../com/xiaomi/mace/demo/result/InitData.java | 14 +- .../src/main/cpp/image_classify.cc | 101 +++---- .../macelibrary/src/main/cpp/image_classify.h | 10 +- .../java/com/xiaomi/mace/JniMaceUtils.java | 4 +- mace/examples/cli/example.cc | 55 ++-- mace/kernels/activation.h | 21 +- mace/kernels/addn.h | 11 +- mace/kernels/argmax.h | 4 +- mace/kernels/arm/conv_winograd_test.cc | 8 +- mace/kernels/batch_norm.h | 24 +- mace/kernels/bias_add.h | 18 +- mace/kernels/buffer_to_image.h | 18 +- mace/kernels/channel_shuffle.h | 11 +- mace/kernels/concat.h | 12 +- mace/kernels/conv_2d.h | 35 ++- mace/kernels/crop.h | 25 +- mace/kernels/deconv_2d.h | 20 +- mace/kernels/depth_to_space.h | 17 +- mace/kernels/depthwise_conv2d.h | 28 +- mace/kernels/eltwise.h | 21 +- mace/kernels/fill.h | 5 +- mace/kernels/fully_connected.h | 25 +- mace/kernels/gather.h | 9 +- mace/kernels/gemm.cc | 8 +- mace/kernels/image_to_buffer.h | 18 +- mace/kernels/kernel.h | 31 +++ mace/kernels/local_response_norm.h | 6 +- mace/kernels/lstmcell.h | 8 +- mace/kernels/matmul.h | 12 +- mace/kernels/opencl/activation.cc | 10 +- mace/kernels/opencl/addn.cc | 6 +- mace/kernels/opencl/batch_norm.cc | 8 +- mace/kernels/opencl/bias_add.cc | 6 +- mace/kernels/opencl/buffer_to_image.cc | 4 +- mace/kernels/opencl/channel_shuffle.cc | 8 +- mace/kernels/opencl/concat.cc | 36 +-- mace/kernels/opencl/conv_2d.cc | 20 +- mace/kernels/opencl/conv_2d_1x1.cc | 19 +- mace/kernels/opencl/conv_2d_3x3.cc | 19 +- mace/kernels/opencl/conv_2d_general.cc | 18 +- mace/kernels/opencl/crop.cc | 14 +- mace/kernels/opencl/deconv_2d.cc | 18 +- mace/kernels/opencl/depth_to_space.cc | 8 +- mace/kernels/opencl/depthwise_conv.cc | 30 +-- mace/kernels/opencl/eltwise.cc | 8 +- mace/kernels/opencl/fully_connected.cc | 19 +- mace/kernels/opencl/helper.cc | 28 +- mace/kernels/opencl/helper.h | 13 +- mace/kernels/opencl/image_to_buffer.cc | 4 +- mace/kernels/opencl/lstmcell.cc | 6 +- mace/kernels/opencl/matmul.cc | 6 +- .../kernels/opencl/out_of_range_check_test.cc | 27 +- mace/kernels/opencl/pad.cc | 8 +- mace/kernels/opencl/pooling.cc | 14 +- mace/kernels/opencl/reduce_mean.cc 
| 4 +- mace/kernels/opencl/resize_bicubic.cc | 16 +- mace/kernels/opencl/resize_bilinear.cc | 14 +- mace/kernels/opencl/softmax.cc | 14 +- mace/kernels/opencl/space_to_batch.cc | 8 +- mace/kernels/opencl/split.cc | 6 +- mace/kernels/opencl/winograd_transform.cc | 12 +- mace/kernels/pad.h | 20 +- mace/kernels/pooling.h | 46 +++- mace/kernels/proposal.h | 7 +- mace/kernels/quantize.h | 9 +- mace/kernels/reduce_mean.h | 21 +- mace/kernels/reshape.h | 9 +- mace/kernels/resize_bicubic.h | 22 +- mace/kernels/resize_bilinear.h | 21 +- mace/kernels/scalar_math.h | 14 +- mace/kernels/sgemm.h | 2 +- mace/kernels/softmax.h | 10 +- mace/kernels/space_to_batch.h | 20 +- mace/kernels/split.h | 16 +- mace/kernels/stack.h | 6 +- mace/kernels/strided_slice.h | 11 +- mace/kernels/transpose.h | 5 +- mace/kernels/unstack.h | 6 +- mace/kernels/winograd_transform.h | 42 ++- mace/libmace/mace.cc | 247 ++++++++++++++++-- mace/libmace/mace_runtime.cc | 113 -------- mace/libmace/mace_version_script.lds | 9 +- mace/ops/BUILD | 20 +- mace/ops/activation.h | 7 +- mace/ops/activation_test.cc | 14 +- mace/ops/addn.h | 4 +- mace/ops/addn_test.cc | 10 +- mace/ops/argmax.h | 4 +- mace/ops/argmax_test.cc | 2 +- mace/ops/batch_norm.h | 9 +- mace/ops/batch_norm_test.cc | 30 ++- mace/ops/batch_to_space.h | 7 +- mace/ops/bias_add.h | 9 +- mace/ops/bias_add_test.cc | 14 +- mace/ops/buffer_to_image.h | 7 +- mace/ops/cast.h | 4 +- mace/ops/channel_shuffle.h | 6 +- mace/ops/channel_shuffle_test.cc | 4 +- mace/ops/concat.h | 6 +- mace/ops/conv_2d.h | 9 +- mace/ops/conv_2d_test.cc | 46 ++-- mace/ops/conv_pool_2d_base.h | 4 +- mace/ops/core_test.cc | 8 +- mace/ops/crop.h | 7 +- mace/ops/crop_test.cc | 2 +- mace/ops/deconv_2d.h | 7 +- mace/ops/deconv_2d_test.cc | 8 +- mace/ops/depth_to_space.h | 6 +- mace/ops/depth_to_space_test.cc | 2 +- mace/ops/depthwise_conv2d.h | 7 +- mace/ops/depthwise_conv2d_test.cc | 14 +- mace/ops/eltwise.h | 5 +- mace/ops/eltwise_test.cc | 24 +- mace/ops/fill.h | 6 +- mace/ops/folded_batch_norm.h | 7 +- mace/ops/folded_batch_norm_test.cc | 30 ++- mace/ops/fully_connected.h | 9 +- mace/ops/fully_connected_test.cc | 10 +- mace/ops/gather.h | 7 +- mace/ops/gather_test.cc | 2 +- mace/ops/identity.h | 4 +- mace/ops/image_to_buffer.h | 7 +- mace/ops/infer_conv2d_shape.h | 4 +- mace/ops/local_response_norm.h | 4 +- mace/ops/local_response_norm_test.cc | 2 +- mace/ops/lstmcell.h | 10 +- mace/ops/matmul.h | 8 +- mace/ops/matmul_test.cc | 12 +- mace/ops/ops_test_util.cc | 44 ++++ mace/ops/ops_test_util.h | 114 +++++--- mace/ops/pad.h | 7 +- mace/ops/pad_test.cc | 12 +- mace/ops/pooling.h | 7 +- mace/ops/pooling_test.cc | 36 +-- mace/ops/proposal.h | 7 +- mace/ops/proposal_test.cc | 3 +- mace/ops/quantize.h | 9 +- mace/ops/reduce_mean.h | 7 +- mace/ops/reduce_mean_test.cc | 2 +- mace/ops/reshape.h | 4 +- mace/ops/resize_bicubic.h | 7 +- mace/ops/resize_bicubic_test.cc | 6 +- mace/ops/resize_bilinear.h | 7 +- mace/ops/resize_bilinear_test.cc | 10 +- mace/ops/scalar_math.h | 7 +- mace/ops/scalar_math_test.cc | 90 +++---- mace/ops/shape.h | 4 +- mace/ops/softmax.h | 5 +- mace/ops/softmax_test.cc | 8 +- mace/ops/space_to_batch.h | 7 +- mace/ops/space_to_batch_test.cc | 11 +- mace/ops/space_to_depth.h | 8 +- mace/ops/split.h | 6 +- mace/ops/squeeze.h | 4 +- mace/ops/stack.h | 6 +- mace/ops/strided_slice.h | 7 +- mace/ops/transpose.h | 6 +- mace/ops/unstack.h | 6 +- mace/ops/winograd_convolution_test.cc | 25 +- mace/ops/winograd_inverse_transform.h | 8 +- mace/ops/winograd_transform.h | 7 +- mace/public/BUILD | 1 - 
 mace/public/mace.h                            | 195 +++++++++++++-
 mace/public/mace_runtime.h                    | 186 -------------
 .../python/tools/mace_engine_factory.h.jinja2 |   8 +-
 mace/test/BUILD                               |   3 -
 mace/test/mace_api_exception_test.cc          |   6 +-
 mace/test/mace_api_mt_test.cc                 |  13 +-
 mace/test/mace_api_test.cc                    |   9 +-
 mace/tools/quantization/quantize_stat.cc      |  16 +-
 mace/tools/validation/mace_run.cc             |  53 ++--
 mace/utils/tuner.h                            |  41 ++-
 mace/utils/tuner_test.cc                      |  10 +-
 205 files changed, 2483 insertions(+), 1563 deletions(-)
 create mode 100644 mace/core/device.cc
 create mode 100644 mace/core/device.h
 create mode 100644 mace/core/device_context.cc
 create mode 100644 mace/core/device_context.h
 create mode 100644 mace/core/op_kernel_context.cc
 create mode 100644 mace/core/op_kernel_context.h
 create mode 100644 mace/core/runtime/opencl/gpu_device.cc
 create mode 100644 mace/core/runtime/opencl/gpu_device.h
 create mode 100644 mace/kernels/kernel.h
 delete mode 100644 mace/libmace/mace_runtime.cc
 create mode 100644 mace/ops/ops_test_util.cc
 delete mode 100644 mace/public/mace_runtime.h

diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index 9a689f45..26fb2d0b 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -22,7 +22,6 @@
 
 #include "gflags/gflags.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 #include "mace/utils/utils.h"
 #include "mace/benchmark/statistics.h"
@@ -257,36 +256,40 @@ int Main(int argc, char **argv) {
 
   mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
 
-  // config runtime
-  MaceStatus ret = mace::SetOpenMPThreadPolicy(
+  // configuration
+  MaceStatus mace_status;
+  MaceEngineConfig config(device_type);
+  mace_status = config.SetCPUThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
       true);
-  if (ret != MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
+  if (mace_status != MACE_SUCCESS) {
+    LOG(INFO) << "Set openmp or cpu affinity failed.";
   }
 
 #ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context;
   if (device_type == DeviceType::GPU) {
-    mace::SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
+    // DO NOT USE tmp directory.
+    // Please use APP's own directory and make sure the directory exists.
+    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
+    const std::string storage_path =
+        std::string(storage_path_ptr == nullptr ?
+                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
     std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-    mace::SetOpenCLBinaryPaths(opencl_binary_paths);
-    mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file);
+    gpu_context = GPUContextBuilder()
+        .SetStoragePath(storage_path)
+        .SetOpenCLBinaryPaths(opencl_binary_paths)
+        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
+        .Finalize();
+
+    config.SetGPUContext(gpu_context);
+    config.SetGPUHints(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 #endif  // MACE_ENABLE_OPENCL
 
-  const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH");
-  const std::string kernel_file_path =
-      std::string(kernel_path == nullptr ?
- "/data/local/tmp/mace_run/interior" : kernel_path); - - std::shared_ptr storage_factory( - new FileStorageFactory(kernel_file_path)); - SetKVStorageFactory(storage_factory); - // Create Engine std::shared_ptr engine; MaceStatus create_engine_status; @@ -306,7 +309,7 @@ int Main(int argc, char **argv) { model_data_file_ptr, input_names, output_names, - device_type, + config, &engine); #else create_engine_status = @@ -314,7 +317,7 @@ int Main(int argc, char **argv) { model_data_file_ptr, input_names, output_names, - device_type, + config, &engine); #endif if (create_engine_status != MaceStatus::MACE_SUCCESS) { diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc index 07776bc1..d9b5c3c2 100644 --- a/mace/core/allocator.cc +++ b/mace/core/allocator.cc @@ -13,30 +13,12 @@ // limitations under the License. #include "mace/core/allocator.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_allocator.h" -#endif namespace mace { -std::map *gAllocatorRegistry() { - static std::map g_allocator_registry; - return &g_allocator_registry; +Allocator *GetCPUAllocator() { + static CPUAllocator allocator; + return &allocator; } -Allocator *GetDeviceAllocator(DeviceType type) { - auto iter = gAllocatorRegistry()->find(type); - if (iter == gAllocatorRegistry()->end()) { - LOG(ERROR) << "Allocator not found for device " << type; - return nullptr; - } - return iter->second; -} - -MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator()); -#ifdef MACE_ENABLE_OPENCL -MACE_REGISTER_ALLOCATOR(DeviceType::GPU, new OpenCLAllocator()); -#endif -MACE_REGISTER_ALLOCATOR(DeviceType::HEXAGON, new CPUAllocator()); - } // namespace mace diff --git a/mace/core/allocator.h b/mace/core/allocator.h index a212e7f9..51f04741 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -26,8 +26,6 @@ #include "mace/core/registry.h" #include "mace/core/types.h" #include "mace/core/runtime_failure_mock.h" -#include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" namespace mace { @@ -138,26 +136,8 @@ class CPUAllocator : public Allocator { bool OnHost() const override { return true; } }; -std::map *gAllocatorRegistry(); - -Allocator *GetDeviceAllocator(DeviceType type); - -struct AllocatorRegisterer { - explicit AllocatorRegisterer(DeviceType type, Allocator *alloc) { - if (gAllocatorRegistry()->count(type)) { - LOG(ERROR) << "Allocator for device type " << type - << " registered twice. This should not happen." 
- << gAllocatorRegistry()->count(type); - std::exit(1); - } - gAllocatorRegistry()->emplace(type, alloc); - } -}; - -#define MACE_REGISTER_ALLOCATOR(type, alloc) \ - namespace { \ - static AllocatorRegisterer MACE_ANONYMOUS_VARIABLE(Allocator)(type, alloc); \ - } +// Global CPU allocator used for CPU/GPU/DSP +Allocator *GetCPUAllocator(); } // namespace mace diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 3e1cca93..50ec4ead 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -20,7 +20,6 @@ #include #include "mace/proto/mace.pb.h" -#include "mace/public/mace.h" namespace mace { diff --git a/mace/core/buffer.h b/mace/core/buffer.h index b349cf4b..c57a1714 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -218,9 +218,9 @@ class Buffer : public BufferBase { class Image : public BufferBase { public: - Image() + explicit Image(Allocator *allocator) : BufferBase(0), - allocator_(GetDeviceAllocator(GPU)), + allocator_(allocator), buf_(nullptr), mapped_buf_(nullptr) {} diff --git a/mace/core/device.cc b/mace/core/device.cc new file mode 100644 index 00000000..09f5a068 --- /dev/null +++ b/mace/core/device.cc @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/device.h" + +namespace mace { + +CPUDevice::CPUDevice(const int num_threads) + : cpu_runtime_(new CPURuntime(num_threads)) {} + +CPUDevice::~CPUDevice() = default; + +CPURuntime *CPUDevice::cpu_runtime() { + return cpu_runtime_.get(); +} + +#ifdef MACE_ENABLE_OPENCL +OpenCLRuntime *CPUDevice::opencl_runtime() { + return nullptr; +} +#endif + +Allocator *CPUDevice::allocator() { + return GetCPUAllocator(); +} + +DeviceType CPUDevice::device_type() const { + return DeviceType::CPU; +} + +} // namespace mace diff --git a/mace/core/device.h b/mace/core/device.h new file mode 100644 index 00000000..7336d79f --- /dev/null +++ b/mace/core/device.h @@ -0,0 +1,60 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef MACE_CORE_DEVICE_H_
+#define MACE_CORE_DEVICE_H_
+
+#include <memory>
+
+#include "mace/core/runtime/cpu/cpu_runtime.h"
+#include "mace/core/allocator.h"
+
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
+
+namespace mace {
+
+class Device {
+ public:
+  virtual ~Device() {}
+
+#ifdef MACE_ENABLE_OPENCL
+  virtual OpenCLRuntime *opencl_runtime() = 0;
+#endif
+  virtual CPURuntime *cpu_runtime() = 0;
+
+  virtual Allocator *allocator() = 0;
+  virtual DeviceType device_type() const = 0;
+};
+
+class CPUDevice : public Device {
+ public:
+  explicit CPUDevice(const int num_threads);
+  virtual ~CPUDevice();
+
+#ifdef MACE_ENABLE_OPENCL
+  OpenCLRuntime *opencl_runtime() override;
+#endif
+  CPURuntime *cpu_runtime() override;
+
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
+
+ private:
+  std::unique_ptr<CPURuntime> cpu_runtime_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_H_
diff --git a/mace/core/device_context.cc b/mace/core/device_context.cc
new file mode 100644
index 00000000..88a965fa
--- /dev/null
+++ b/mace/core/device_context.cc
@@ -0,0 +1,73 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/device_context.h"
+
+#include <sys/stat.h>
+
+namespace mace {
+
+namespace {
+
+const char *kPrecompiledProgramFileName = "mace_cl_compiled_program.bin";
+
+std::string FindFirstExistPath(const std::vector<std::string> &paths) {
+  std::string result;
+  struct stat st;
+  for (auto path : paths) {
+    if (stat(path.c_str(), &st) == 0) {
+      if (S_ISREG(st.st_mode)) {
+        result = path;
+        break;
+      }
+    }
+  }
+  return result;
+}
+}  // namespace
+
+GPUContext::GPUContext(const std::string &storage_path,
+                       const std::vector<std::string> &opencl_binary_paths,
+                       const std::string &opencl_parameter_path)
+    : storage_factory_(new FileStorageFactory(storage_path)),
+      opencl_tuner_(new Tuner<uint32_t>(opencl_parameter_path)) {
+
+  if (!storage_path.empty()) {
+    opencl_cache_storage_ =
+        storage_factory_->CreateStorage(kPrecompiledProgramFileName);
+  }
+
+  std::string precompiled_binary_path =
+      FindFirstExistPath(opencl_binary_paths);
+  if (!precompiled_binary_path.empty()) {
+    opencl_binary_storage_.reset(
+        new FileStorage(precompiled_binary_path));
+  }
+}
+
+GPUContext::~GPUContext() = default;
+
+KVStorage *GPUContext::opencl_binary_storage() {
+  return opencl_binary_storage_.get();
+}
+
+KVStorage *GPUContext::opencl_cache_storage() {
+  return opencl_cache_storage_.get();
+}
+
+Tuner<uint32_t> *GPUContext::opencl_tuner() {
+  return opencl_tuner_.get();
+}
+
+}  // namespace mace
diff --git a/mace/core/device_context.h b/mace/core/device_context.h
new file mode 100644
index 00000000..21d07673
--- /dev/null
+++ b/mace/core/device_context.h
@@ -0,0 +1,47 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_DEVICE_CONTEXT_H_
+#define MACE_CORE_DEVICE_CONTEXT_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mace/core/file_storage.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+
+class GPUContext {
+ public:
+  GPUContext(const std::string &storage_path = "",
+             const std::vector<std::string> &opencl_binary_path = {},
+             const std::string &opencl_parameter_path = "");
+  ~GPUContext();
+
+  KVStorage *opencl_binary_storage();
+  KVStorage *opencl_cache_storage();
+  Tuner<uint32_t> *opencl_tuner();
+
+ private:
+  std::unique_ptr<KVStorageFactory> storage_factory_;
+  std::unique_ptr<Tuner<uint32_t>> opencl_tuner_;
+  std::unique_ptr<KVStorage> opencl_binary_storage_;
+  std::unique_ptr<KVStorage> opencl_cache_storage_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_DEVICE_CONTEXT_H_
diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc
index 99731a81..7c1fb35b 100644
--- a/mace/core/file_storage.cc
+++ b/mace/core/file_storage.cc
@@ -28,10 +28,36 @@
 
 namespace mace {
 
-std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
+class FileStorageFactory::Impl {
+ public:
+  explicit Impl(const std::string &path);
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
+
+ private:
+  std::string path_;
+};
+
+FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
+
+std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
+    const std::string &name) {
+  return std::move(std::unique_ptr<KVStorage>(
+      new FileStorage(path_ + "/" + name)));
+}
+
+FileStorageFactory::FileStorageFactory(const std::string &path):
+    impl_(new FileStorageFactory::Impl(path)) {}
+
+FileStorageFactory::~FileStorageFactory() = default;
+
+std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
+    const std::string &name) {
+  return impl_->CreateStorage(name);
+}
 
 FileStorage::FileStorage(const std::string &file_path):
-    data_changed_(false), file_path_(file_path) {}
+    loaded_(false), data_changed_(false), file_path_(file_path) {}
 
 int FileStorage::Load() {
   struct stat st;
@@ -47,6 +73,9 @@ int FileStorage::Load() {
     }
   }
   utils::WriteLock lock(&data_mutex_);
+  if (loaded_) {
+    return 0;
+  }
   int fd = open(file_path_.c_str(), O_RDONLY);
   if (fd < 0) {
     if (errno == ENOENT) {
@@ -118,13 +147,17 @@ int FileStorage::Load() {
               << " failed, error code: " << strerror(errno);
     return -1;
   }
+  loaded_ = true;
   return 0;
 }
 
-void FileStorage::Clear() {
+bool FileStorage::Clear() {
   utils::WriteLock lock(&data_mutex_);
-  data_.clear();
-  data_changed_ = true;
+  if (!data_.empty()) {
+    data_.clear();
+    data_changed_ = true;
+  }
+  return true;
 }
 
 bool FileStorage::Insert(const std::string &key,
diff --git a/mace/core/file_storage.h b/mace/core/file_storage.h
index 3b648c23..c4efe8c3 100644
--- a/mace/core/file_storage.h
+++ b/mace/core/file_storage.h
@@ -16,27 +16,64 @@
 #define MACE_CORE_FILE_STORAGE_H_
 
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "mace/public/mace_runtime.h"
+#include "mace/public/mace.h"
 #include "mace/utils/rwlock.h"
 
 namespace mace {
 
+class KVStorage {
+ public:
+  // return: 0 for success, -1 for error
+  virtual int Load() = 0;
+  virtual bool Clear() = 0;
+  // insert or update the key-value.
+  virtual bool Insert(const std::string &key,
+                      const std::vector<unsigned char> &value) = 0;
+  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
+  // return: 0 for success, -1 for error
+  virtual int Flush() = 0;
+  virtual ~KVStorage() {}
+};
+
+class KVStorageFactory {
+ public:
+  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
+
+  virtual ~KVStorageFactory() {}
+};
+
+class FileStorageFactory : public KVStorageFactory {
+ public:
+  // You have to make sure your APP have read and write permission of the path.
+  explicit FileStorageFactory(const std::string &path);
+
+  ~FileStorageFactory();
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 class FileStorage : public KVStorage {
  public:
  explicit FileStorage(const std::string &file_path);
 
  public:
   int Load() override;
-  void Clear() override;
+  bool Clear() override;
   bool Insert(const std::string &key,
               const std::vector<unsigned char> &value) override;
   const std::vector<unsigned char> *Find(const std::string &key) override;
   int Flush() override;
 
  private:
+  bool loaded_;
   bool data_changed_;
   std::string file_path_;
   std::map<std::string, std::vector<unsigned char>> data_;
diff --git a/mace/core/net.cc b/mace/core/net.cc
index ec8afdd1..0c538b80 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -18,6 +18,7 @@
 
 #include "mace/core/macros.h"
 #include "mace/core/net.h"
+#include "mace/public/mace.h"
 #include "mace/utils/memory_logging.h"
 #include "mace/utils/timer.h"
 #include "mace/utils/utils.h"
@@ -27,30 +28,35 @@ namespace mace {
 
 NetBase::NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
                  const std::shared_ptr<const NetDef> net_def,
                  Workspace *ws,
-                 DeviceType type)
+                 Device *device)
     : name_(net_def->name()), op_registry_(op_registry) {
   MACE_UNUSED(ws);
-  MACE_UNUSED(type);
+  MACE_UNUSED(device);
 }
 
 SerialNet::SerialNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode)
-    : NetBase(op_registry, net_def, ws, type), device_type_(type) {
+    : NetBase(op_registry, net_def, ws, device), device_(device),
+      op_kernel_context_(new OpKernelContext(ws, device)) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
+  DeviceType device_type = device->device_type();
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto &operator_def = net_def->op(idx);
     // TODO(liuqi): refactor to add device_type to OperatorDef
     const int op_device =
         ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-            operator_def, "device", static_cast<int>(device_type_));
-    if (op_device == type) {
+            operator_def, "device", static_cast<int>(device_type));
+    if (op_device == device_type) {
+      VLOG(3) << "Creating operator " << operator_def.name() << "("
+              << operator_def.type() << ")";
       OperatorDef temp_def(operator_def);
       std::unique_ptr<OperatorBase> op(
-          op_registry->CreateOperator(temp_def, ws, type, mode));
+          op_registry->CreateOperator(temp_def, op_kernel_context_.get(),
+                                      device_type, mode));
       if (op) {
         operators_.emplace_back(std::move(op));
       }
@@ -61,13 +67,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
   MACE_MEMORY_LOGGING_GUARD();
   MACE_LATENCY_LOGGER(1, "Running net");
+  const DeviceType device_type = device_->device_type();
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
     MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                         op->debug_def().type(), "), mem_id: ",
                         MakeListString(op->debug_def().mem_id().data(),
                                        op->debug_def().mem_id().size()));
-    bool future_wait = (device_type_ == DeviceType::GPU &&
+    bool future_wait = (device_type == DeviceType::GPU &&
                         (run_metadata != nullptr ||
                          std::distance(iter, operators_.end()) == 1));
@@ -80,6 +87,9 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       } else {
         future.wait_fn(nullptr);
       }
+#ifdef MACE_ENABLE_OPENCL
+      device_->opencl_runtime()->command_queue().finish();
+#endif
     } else if (run_metadata != nullptr) {
       call_stats.start_micros = NowMicros();
       MACE_RETURN_IF_ERROR(op->Run(nullptr));
@@ -125,7 +135,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
       VLOG(3) << "Operator " << op->debug_def().name()
               << " has shape: " << MakeString(op->Output(0)->shape());
 
-      if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type_ == CPU) {
+      if (EnvEnabled("MACE_LOG_TENSOR_RANGE") && device_type == CPU) {
         for (int i = 0; i < op->OutputSize(); ++i) {
           int data_type = op->GetOptionalArg<int>("T", static_cast<int>(DT_FLOAT));
           if (data_type == static_cast<int>(DT_FLOAT)) {
@@ -151,20 +161,20 @@ std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const NetDef &net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode) {
   std::shared_ptr<NetDef> tmp_net_def(new NetDef(net_def));
-  return CreateNet(op_registry, tmp_net_def, ws, type, mode);
+  return CreateNet(op_registry, tmp_net_def, ws, device, mode);
 }
 
 std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode) {
   std::unique_ptr<NetBase> net(
-      new SerialNet(op_registry, net_def, ws, type, mode));
+      new SerialNet(op_registry, net_def, ws, device, mode));
   return net;
 }
diff --git a/mace/core/net.h b/mace/core/net.h
index 0cec4059..a63ded66 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -20,7 +20,6 @@
 #include <vector>
 
 #include "mace/core/operator.h"
-#include "mace/public/mace.h"
 
 namespace mace {
 
@@ -33,7 +32,7 @@ class NetBase {
   NetBase(const std::shared_ptr<const OperatorRegistryBase> op_registry,
           const std::shared_ptr<const NetDef> net_def,
           Workspace *ws,
-          DeviceType type);
+          Device *device);
   virtual ~NetBase() noexcept {}
 
   virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0;
@@ -52,14 +51,15 @@ class SerialNet : public NetBase {
   SerialNet(const std::shared_ptr<const OperatorRegistryBase> op_registry,
             const std::shared_ptr<const NetDef> net_def,
             Workspace *ws,
-            DeviceType type,
+            Device *device,
             const NetMode mode = NetMode::NORMAL);
 
   MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
 
  protected:
   std::vector<std::unique_ptr<OperatorBase> > operators_;
-  DeviceType device_type_;
+  Device *device_;
+  std::unique_ptr<OpKernelContext> op_kernel_context_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(SerialNet);
 };
@@ -68,13 +68,13 @@ std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const NetDef &net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode = NetMode::NORMAL);
 
 std::unique_ptr<NetBase> CreateNet(
     const std::shared_ptr<const OperatorRegistryBase> op_registry,
     const std::shared_ptr<const NetDef> net_def,
     Workspace *ws,
-    DeviceType type,
+    Device *device,
     const NetMode mode = NetMode::NORMAL);
 
 }  // namespace mace
diff --git a/mace/core/op_kernel_context.cc b/mace/core/op_kernel_context.cc
new file mode 100644
index 00000000..20f9e561
--- /dev/null
+++ b/mace/core/op_kernel_context.cc
@@ -0,0 +1,32 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/op_kernel_context.h"
+
+namespace mace {
+
+OpKernelContext::OpKernelContext(Workspace *ws, Device *device)
+    : device_(device), ws_(ws) {}
+
+OpKernelContext::~OpKernelContext() = default;
+
+Device* OpKernelContext::device() {
+  return device_;
+}
+
+Workspace* OpKernelContext::workspace() {
+  return ws_;
+}
+
+}  // namespace mace
diff --git a/mace/core/op_kernel_context.h b/mace/core/op_kernel_context.h
new file mode 100644
index 00000000..fe5e777c
--- /dev/null
+++ b/mace/core/op_kernel_context.h
@@ -0,0 +1,34 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_
+#define MACE_CORE_OP_KERNEL_CONTEXT_H_
+
+#include "mace/core/device.h"
+#include "mace/core/workspace.h"
+namespace mace {
+
+class OpKernelContext {
+ public:
+  OpKernelContext(Workspace *ws, Device *device);
+  ~OpKernelContext();
+  Device *device();
+  Workspace *workspace();
+ private:
+  Device *device_;
+  Workspace *ws_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_OP_KERNEL_CONTEXT_H_
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 20769fa3..5e404835 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -18,12 +18,15 @@
 #include <vector>
 
 #include "mace/core/operator.h"
+#include "mace/core/op_kernel_context.h"
 
 namespace mace {
 
-OperatorBase::OperatorBase(const OperatorDef &operator_def, Workspace *ws)
-    : operator_ws_(ws),
-      operator_def_(std::make_shared<OperatorDef>(operator_def)) {}
+OperatorBase::OperatorBase(const OperatorDef &operator_def,
+                           OpKernelContext *context)
+    : operator_def_(std::make_shared<OperatorDef>(operator_def)) {
+  MACE_UNUSED(context);
+}
 
 OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {}
 
@@ -54,7 +57,7 @@ OperatorRegistryBase::~OperatorRegistryBase() {}
 
 std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
     const OperatorDef &operator_def,
-    Workspace *ws,
+    OpKernelContext *context,
     DeviceType type,
     const NetMode mode) const {
   const int dtype = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
@@ -70,7 +73,7 @@ std::unique_ptr<OperatorBase> OperatorRegistryBase::CreateOperator(
             .Device(type)
             .TypeConstraint("T", static_cast<DataType>(dtype))
             .Build(),
-        operator_def, ws);
+        operator_def, context);
   } else {
     return nullptr;
   }
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 330f8002..6be38890 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -22,17 +22,17 @@
 
 #include "mace/core/arg_helper.h"
 #include "mace/core/future.h"
+#include "mace/core/op_kernel_context.h"
 #include "mace/core/registry.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
#include "mace/proto/mace.pb.h" -#include "mace/public/mace.h" namespace mace { class OperatorBase { public: - explicit OperatorBase(const OperatorDef &operator_def, Workspace *ws); + explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *); virtual ~OperatorBase() noexcept {} template @@ -78,7 +78,6 @@ class OperatorBase { inline bool has_debug_def() const { return operator_def_ != nullptr; } protected: - Workspace *operator_ws_; std::shared_ptr operator_def_; std::vector inputs_; std::vector outputs_; @@ -89,8 +88,9 @@ class OperatorBase { template class Operator : public OperatorBase { public: - explicit Operator(const OperatorDef &operator_def, Workspace *ws) - : OperatorBase(operator_def, ws) { + explicit Operator(const OperatorDef &operator_def, OpKernelContext *context) + : OperatorBase(operator_def, context) { + Workspace *ws = context->workspace(); for (const std::string &input_str : operator_def.input()) { const Tensor *tensor = ws->GetTensor(input_str); MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), @@ -116,7 +116,7 @@ class Operator : public OperatorBase { output_type = DataTypeToEnum::v(); } outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( - output_str, GetDeviceAllocator(D), output_type))); + output_str, context->device()->allocator(), output_type))); } } } @@ -165,13 +165,16 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { class OperatorRegistryBase { public: - typedef Registry + typedef Registry RegistryType; OperatorRegistryBase() = default; virtual ~OperatorRegistryBase(); RegistryType *registry() { return ®istry_; } std::unique_ptr CreateOperator(const OperatorDef &operator_def, - Workspace *ws, + OpKernelContext *context, DeviceType type, const NetMode mode) const; @@ -183,7 +186,7 @@ class OperatorRegistryBase { MACE_DECLARE_REGISTRY(OpRegistry, OperatorBase, const OperatorDef &, - Workspace *); + OpKernelContext *); #define MACE_REGISTER_OPERATOR(op_registry, name, ...) 
   MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__)
diff --git a/mace/core/registry.h b/mace/core/registry.h
index ac813287..1ad92f0a 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -22,7 +22,6 @@
 #include <string>
 #include <vector>
 
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index f9b1d49f..671d4cdf 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -30,7 +30,6 @@
 #include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 
 namespace mace {
diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h
index 3382a8f1..83d397ee 100644
--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -18,7 +18,6 @@
 #include <vector>
 
 #include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 
 namespace mace {
 
@@ -34,6 +33,16 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
                                              CPUAffinityPolicy policy,
                                              bool use_gemmlowp = false);
 
+class CPURuntime {
+ public:
+  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  ~CPURuntime() = default;
+  inline int num_threads() const {
+    return num_threads_;
+  }
+ private:
+  int num_threads_;
+};
 }  // namespace mace
 
 #endif  // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc
new file mode 100644
index 00000000..cd9e41bb
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -0,0 +1,44 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/runtime/opencl/gpu_device.h"
+
+namespace mace {
+
+GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
+                     KVStorage *opencl_cache_storage,
+                     const GPUPriorityHint priority,
+                     const GPUPerfHint perf,
+                     KVStorage *opencl_binary_storage,
+                     const int num_threads) :
+    CPUDevice(num_threads),
+    runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
+                               opencl_binary_storage, tuner)),
+    allocator_(new OpenCLAllocator(runtime_.get())) {}
+
+GPUDevice::~GPUDevice() = default;
+
+OpenCLRuntime* GPUDevice::opencl_runtime() {
+  return runtime_.get();
+}
+
+Allocator* GPUDevice::allocator() {
+  return allocator_.get();
+}
+
+DeviceType GPUDevice::device_type() const {
+  return DeviceType::GPU;
+}
+
+}  // namespace mace
diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h
new file mode 100644
index 00000000..1526ba0a
--- /dev/null
+++ b/mace/core/runtime/opencl/gpu_device.h
@@ -0,0 +1,44 @@
+// Copyright 2018 Xiaomi, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
+#define MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
+
+#include <memory>
+
+#include "mace/core/device_context.h"
+#include "mace/core/device.h"
+#include "mace/core/runtime/opencl/opencl_allocator.h"
+
+namespace mace {
+
+class GPUDevice : public CPUDevice {
+ public:
+  GPUDevice(Tuner<uint32_t> *tuner,
+            KVStorage *opencl_cache_storage = nullptr,
+            const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
+            const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
+            KVStorage *opencl_binary_storage = nullptr,
+            const int num_threads = -1);
+  ~GPUDevice();
+  OpenCLRuntime *opencl_runtime() override;
+  Allocator *allocator() override;
+  DeviceType device_type() const override;
+ private:
+  std::unique_ptr<OpenCLRuntime> runtime_;
+  std::unique_ptr<OpenCLAllocator> allocator_;
+};
+
+}  // namespace mace
+#endif  // MACE_CORE_RUNTIME_OPENCL_GPU_DEVICE_H_
diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index 86b0138d..c22e4f8f 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
+
 #include "mace/core/runtime/opencl/opencl_allocator.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
@@ -37,7 +38,9 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   }
 }
 }  // namespace
 
-OpenCLAllocator::OpenCLAllocator() {}
+OpenCLAllocator::OpenCLAllocator(
+    OpenCLRuntime *opencl_runtime):
+    opencl_runtime_(opencl_runtime) {}
 
 OpenCLAllocator::~OpenCLAllocator() {}
 
 MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
@@ -51,7 +54,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
   }
 
   cl_int error;
-  cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
+  cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
                                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                                       nbytes, nullptr, &error);
   if (error != CL_SUCCESS) {
@@ -82,7 +85,7 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
 
   cl_int error;
   cl::Image2D *cl_image =
-      new cl::Image2D(OpenCLRuntime::Global()->context(),
+      new cl::Image2D(opencl_runtime_->context(),
                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                       img_format, image_shape[0], image_shape[1], 0, nullptr,
                       &error);
   if (error != CL_SUCCESS) {
@@ -116,8 +119,9 @@ void OpenCLAllocator::DeleteImage(void *buffer) const {
 }
 
 void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
+  VLOG(3) << "Map OpenCL buffer";
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = OpenCLRuntime::Global()->command_queue();
+  auto queue = opencl_runtime_->command_queue();
   // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
@@ -134,14 +138,15 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
 void *OpenCLAllocator::MapImage(void *buffer,
                                 const std::vector<size_t> &image_shape,
                                 std::vector<size_t> *mapped_image_pitch) const {
-  MACE_CHECK(image_shape.size() == 2, "Just support map 2d image");
"Just support map 2d image"); + VLOG(3) << "Map OpenCL Image"; + MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image"; auto cl_image = static_cast(buffer); std::array origin = {0, 0, 0}; std::array region = {image_shape[0], image_shape[1], 1}; mapped_image_pitch->resize(2); cl_int error; - void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage( + void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage( *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region, mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr, nullptr, &error); @@ -153,8 +158,9 @@ void *OpenCLAllocator::MapImage(void *buffer, } void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { + VLOG(3) << "Unmap OpenCL buffer/Image"; auto cl_buffer = static_cast(buffer); - auto queue = OpenCLRuntime::Global()->command_queue(); + auto queue = opencl_runtime_->command_queue(); cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr, nullptr); if (error != CL_SUCCESS) { diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h index 6304add8..d2b7556b 100644 --- a/mace/core/runtime/opencl/opencl_allocator.h +++ b/mace/core/runtime/opencl/opencl_allocator.h @@ -15,15 +15,17 @@ #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_ #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_ +#include #include #include "mace/core/allocator.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" namespace mace { class OpenCLAllocator : public Allocator { public: - OpenCLAllocator(); + explicit OpenCLAllocator(OpenCLRuntime *opencl_runtime); ~OpenCLAllocator() override; @@ -51,6 +53,9 @@ class OpenCLAllocator : public Allocator { void Unmap(void *buffer, void *mapped_ptr) const override; bool OnHost() const override; + + private: + OpenCLRuntime *opencl_runtime_; }; } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 30533703..967a040f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -24,11 +24,9 @@ #include #include -#include "mace/public/mace_runtime.h" #include "mace/core/macros.h" #include "mace/core/file_storage.h" #include "mace/core/runtime/opencl/opencl_extension.h" -#include "mace/public/mace.h" #include "mace/utils/tuner.h" namespace mace { @@ -249,14 +247,12 @@ std::string FindFirstExistPath(const std::vector &paths) { const char *kOpenCLPlatformInfoKey = "mace_opencl_precompiled_platform_info_key"; -const char *kPrecompiledProgramFileName = - "mace_cl_compiled_program.bin"; } // namespace void OpenCLProfilingTimer::StartTiming() {} void OpenCLProfilingTimer::StopTiming() { - OpenCLRuntime::Global()->command_queue().finish(); + runtime_->command_queue().finish(); start_nanos_ = event_->getProfilingInfo(); stop_nanos_ = event_->getProfilingInfo(); } @@ -278,35 +274,15 @@ void OpenCLProfilingTimer::ClearTiming() { accumulated_micros_ = 0; } -GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL; -GPUPriorityHint OpenCLRuntime::kGPUPriorityHint = - GPUPriorityHint::PRIORITY_DEFAULT; -std::string - OpenCLRuntime::kPrecompiledBinaryPath = ""; // NOLINT(runtime/string) - -OpenCLRuntime *OpenCLRuntime::Global() { - static OpenCLRuntime runtime; - return &runtime; -} - -void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint, - GPUPriorityHint gpu_priority_hint) { - OpenCLRuntime::kGPUPerfHint = gpu_perf_hint; - OpenCLRuntime::kGPUPriorityHint = 
-}
-
-void OpenCLRuntime::ConfigureOpenCLBinaryPath(
-    const std::vector<std::string> &paths) {
-  OpenCLRuntime::kPrecompiledBinaryPath = FindFirstExistPath(paths);
-  if (OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-    LOG(WARNING) << "There is no precompiled OpenCL binary file in "
-                 << MakeString(paths);
-  }
-}
-
-OpenCLRuntime::OpenCLRuntime():
-    precompiled_binary_storage_(nullptr),
-    cache_storage_(nullptr),
+OpenCLRuntime::OpenCLRuntime(
+    KVStorage *cache_storage,
+    const GPUPriorityHint priority_hint,
+    const GPUPerfHint perf_hint,
+    KVStorage *precompiled_binary_storage,
+    Tuner<uint32_t> *tuner):
+    cache_storage_(cache_storage),
+    precompiled_binary_storage_(precompiled_binary_storage),
+    tuner_(tuner),
     is_opencl_avaliable_(false),
     is_profiling_enabled_(false),
     opencl_version_(CL_VER_UNKNOWN),
@@ -362,7 +338,7 @@ OpenCLRuntime::OpenCLRuntime():
 
   cl_command_queue_properties properties = 0;
   const char *profiling = getenv("MACE_OPENCL_PROFILING");
-  if (Tuner<uint32_t>::Get()->IsTuning() ||
+  if (IsTuning() ||
       (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
     properties |= CL_QUEUE_PROFILING_ENABLE;
     is_profiling_enabled_ = true;
@@ -374,8 +350,8 @@ OpenCLRuntime::OpenCLRuntime():
 
   std::vector<cl_context_properties> context_properties;
   context_properties.reserve(5);
   GetAdrenoContextProperties(&context_properties,
-                             OpenCLRuntime::kGPUPerfHint,
-                             OpenCLRuntime::kGPUPriorityHint);
+                             perf_hint,
+                             priority_hint);
   context_ = std::shared_ptr<cl::Context>(
       new cl::Context({*device_}, context_properties.data(),
                       nullptr, nullptr, &err));
@@ -408,12 +384,8 @@ OpenCLRuntime::OpenCLRuntime():
     return;
   }
 
-  extern std::shared_ptr<KVStorageFactory> kStorageFactory;
   std::string cached_binary_platform_info;
-  if (kStorageFactory != nullptr) {
-    cache_storage_ =
-        kStorageFactory->CreateStorage(kPrecompiledProgramFileName);
-
+  if (cache_storage_ != nullptr) {
    if (cache_storage_->Load() != 0) {
      LOG(WARNING) << "Load OpenCL cached compiled kernel file failed. "
                   << "Please make sure the storage directory exist "
@@ -432,9 +404,10 @@ OpenCLRuntime::OpenCLRuntime():
   }
 
   if (cached_binary_platform_info != platform_info_) {
-    if (!OpenCLRuntime::kPrecompiledBinaryPath.empty()) {
-      precompiled_binary_storage_.reset(
-          new FileStorage(OpenCLRuntime::kPrecompiledBinaryPath));
+    if (precompiled_binary_storage_ == nullptr) {
+      VLOG(1) << "There is no precompiled OpenCL binary in"
+                 " all OpenCL binary paths.";
+    } else {
       if (precompiled_binary_storage_->Load() != 0) {
        LOG(WARNING) << "Load OpenCL precompiled kernel file failed. "
" << "Please make sure the storage directory exist " @@ -487,6 +460,8 @@ cl::Device &OpenCLRuntime::device() { return *device_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } +Tuner *OpenCLRuntime::tuner() { return tuner_; } + uint64_t OpenCLRuntime::device_global_mem_cache_size() const { return device_gloabl_mem_cache_size_; } diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 537707fa..222fe851 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -22,11 +22,12 @@ #include #include +#include "mace/core/file_storage.h" #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/public/mace_runtime.h" #include "mace/utils/string_util.h" #include "mace/utils/timer.h" +#include "mace/utils/tuner.h" namespace mace { @@ -60,29 +61,17 @@ const std::string OpenCLErrorToString(cl_int error); return MaceStatus::MACE_OUT_OF_RESOURCES; \ } -class OpenCLProfilingTimer : public Timer { - public: - explicit OpenCLProfilingTimer(const cl::Event *event) - : event_(event), accumulated_micros_(0) {} - void StartTiming() override; - void StopTiming() override; - void AccumulateTiming() override; - void ClearTiming() override; - double ElapsedMicros() override; - double AccumulatedMicros() override; - - private: - const cl::Event *event_; - double start_nanos_; - double stop_nanos_; - double accumulated_micros_; -}; - class OpenCLRuntime { public: - static OpenCLRuntime *Global(); - static void Configure(GPUPerfHint, GPUPriorityHint); - static void ConfigureOpenCLBinaryPath(const std::vector &paths); + OpenCLRuntime( + KVStorage *cache_storage = nullptr, + const GPUPriorityHint priority_hint = GPUPriorityHint::PRIORITY_NORMAL, + const GPUPerfHint perf_hint = GPUPerfHint::PERF_NORMAL, + KVStorage *precompiled_binary_storage = nullptr, + Tuner *tuner = nullptr); + ~OpenCLRuntime(); + OpenCLRuntime(const OpenCLRuntime &) = delete; + OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; cl::Context &context(); cl::Device &device(); @@ -91,6 +80,7 @@ class OpenCLRuntime { const std::string platform_info() const; uint64_t device_global_mem_cache_size() const; uint32_t device_compute_units() const; + Tuner *tuner(); bool is_opencl_avaliable(); void GetCallStats(const cl::Event &event, CallStats *stats); @@ -112,11 +102,6 @@ class OpenCLRuntime { void SaveBuiltCLProgram(); private: - OpenCLRuntime(); - ~OpenCLRuntime(); - OpenCLRuntime(const OpenCLRuntime &) = delete; - OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; - bool BuildProgram(const std::string &program_file_name, const std::string &binary_file_name, const std::string &build_options, @@ -137,10 +122,13 @@ class OpenCLRuntime { OpenCLVersion ParseDeviceVersion(const std::string &device_version); private: - std::unique_ptr precompiled_binary_storage_; - std::unique_ptr cache_storage_; + KVStorage *cache_storage_; + KVStorage *precompiled_binary_storage_; + Tuner *tuner_; bool is_opencl_avaliable_; bool is_profiling_enabled_; + OpenCLVersion opencl_version_; + GPUType gpu_type_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. 
   std::shared_ptr<cl::Context> context_;
@@ -149,18 +137,30 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string platform_info_;
-  OpenCLVersion opencl_version_;
   std::string precompiled_binary_platform_info_;
   bool out_of_range_check_;
   uint64_t device_gloabl_mem_cache_size_;
   uint32_t device_compute_units_;
-  GPUType gpu_type_;
-
-  static GPUPerfHint kGPUPerfHint;
-  static GPUPriorityHint kGPUPriorityHint;
-  static std::string kPrecompiledBinaryPath;
 };
 
+class OpenCLProfilingTimer : public Timer {
+ public:
+  OpenCLProfilingTimer(OpenCLRuntime *runtime, const cl::Event *event)
+      : runtime_(runtime), event_(event), accumulated_micros_(0) {}
+  void StartTiming() override;
+  void StopTiming() override;
+  void AccumulateTiming() override;
+  void ClearTiming() override;
+  double ElapsedMicros() override;
+  double AccumulatedMicros() override;
+
+ private:
+  OpenCLRuntime *runtime_;
+  const cl::Event *event_;
+  double start_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;
+};
 }  // namespace mace
 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 62ea5488..f7e50987 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -25,7 +25,6 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
 #endif
-#include "mace/public/mace.h"
 #include "mace/utils/logging.h"
 
 #ifdef MACE_ENABLE_NEON
@@ -38,10 +37,10 @@ namespace mace {
 
 #define MACE_SINGLE_ARG(...) __VA_ARGS__
-#define MACE_CASE(TYPE, STATEMENTS)            \
+#define MACE_CASE(TYPE, STATEMENTS)   \
   case DataTypeToEnum<TYPE>::value: { \
     typedef TYPE T;                   \
-    STATEMENTS;                                \
+    STATEMENTS;                       \
     break;                            \
   }
@@ -137,7 +136,7 @@ class Tensor {
     buffer_ = &buffer_slice_;
   }
 
-  Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
+  Tensor() : Tensor(GetCPUAllocator(), DT_FLOAT) {}
 
   ~Tensor() {
     if (is_buffer_owner_ && buffer_ != nullptr) {
@@ -270,7 +269,7 @@ class Tensor {
     image_shape_ = image_shape;
     if (buffer_ == nullptr) {
       MACE_CHECK(is_buffer_owner_);
-      buffer_ = new Image();
+      buffer_ = new Image(allocator_);
       return buffer_->Allocate(image_shape, dtype_);
     } else {
       MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc
index 48a6928d..569a8345 100644
--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -16,15 +16,10 @@
 
 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/public/mace.h"
-#include "mace/public/mace_runtime.h"
 #include "mace/utils/logging.h"
 
 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
-DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
 DEFINE_int32(cpu_affinity_policy, 1,
              "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
@@ -43,10 +38,6 @@ int main(int argc, char **argv) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
 
-  mace::OpenCLRuntime::Configure(
-      static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
-      static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
-
   mace::testing::Benchmark::Run(FLAGS_filter.c_str());
   return 0;
 }
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 07d85560..4c9204cb 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "mace/core/workspace.h"
+
+#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
@@ -21,8 +24,6 @@
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif
-#include "mace/core/workspace.h"
-#include "mace/utils/timer.h"
 
 namespace mace {
 
@@ -35,8 +36,8 @@ bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
 }
 }  // namespace
 
-Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer(
-    GetDeviceAllocator(DeviceType::CPU))) {}
+Workspace::Workspace() :
+    host_scratch_buffer_(new ScratchBuffer(GetCPUAllocator())) {}
 
 Tensor *Workspace::CreateTensor(const std::string &name,
                                 Allocator *alloc,
@@ -74,7 +75,7 @@ std::vector<std::string> Workspace::Tensors() const {
 }
 
 MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
-                                      DeviceType type,
+                                      Device *device,
                                       const unsigned char *model_data) {
   MACE_LATENCY_LOGGER(1, "Load model tensors");
   index_t model_data_size = 0;
@@ -87,10 +88,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
   }
   VLOG(3) << "Model data size: " << model_data_size;
 
+  const DeviceType device_type = device->device_type();
+
   if (model_data_size > 0) {
 #ifdef MACE_ENABLE_OPENCL
-    if (type == DeviceType::GPU &&
-        OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <=
+    if (device_type == DeviceType::GPU &&
+        device->opencl_runtime()->GetDeviceMaxMemAllocSize() <=
        static_cast<uint64_t>(model_data_size)) {
       for (auto &const_tensor : net_def.tensors()) {
         MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
@@ -104,7 +107,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
         }
 
         std::unique_ptr<Tensor> tensor(
-            new Tensor(GetDeviceAllocator(type),
+            new Tensor(device->allocator(),
                        const_tensor.data_type(), true));
         tensor->Resize(dims);
@@ -129,14 +132,14 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
 #else
     {
 #endif
-      if (type == DeviceType::CPU) {
+      if (device_type == DeviceType::CPU) {
         tensor_buffer_ = std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(type),
+            new Buffer(device->allocator(),
                        const_cast<unsigned char *>(model_data),
                        model_data_size));
       } else {
         tensor_buffer_ = std::unique_ptr<Buffer>(
-            new Buffer(GetDeviceAllocator(type)));
+            new Buffer(device->allocator()));
         MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
         tensor_buffer_->Map(nullptr);
         tensor_buffer_->Copy(const_cast<unsigned char *>(model_data),
@@ -170,12 +173,12 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
     }
   }
 
-  if (type == DeviceType::CPU || type == DeviceType::GPU) {
-    MaceStatus status = CreateOutputTensorBuffer(net_def, type);
+  if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
+    MaceStatus status = CreateOutputTensorBuffer(net_def, device);
     if (status != MaceStatus::MACE_SUCCESS) return status;
   }
 
-  if (type == DeviceType::CPU && net_def.has_quantize_info()) {
+  if (device_type == DeviceType::CPU && net_def.has_quantize_info()) {
     for (const auto
         &activation_info: net_def.quantize_info().activation_info()) {
       if (HasTensor(activation_info.tensor_name())) {
@@ -193,7 +196,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
 }
 
 MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
-                                               DeviceType device_type) {
+                                               Device *device) {
+  DeviceType device_type = device->device_type();
   DataType dtype = DataType::DT_INVALID;
   if (net_def.mem_arena().mem_block_size() > 0) {
     // We use the data type of the first op with mem id,
@@ -227,7 +231,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
               << ", memory type: " << mem_block.mem_type();
memory type: " << mem_block.mem_type(); if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { std::unique_ptr tensor_buf( - new Buffer(GetDeviceAllocator(DeviceType::CPU))); + new Buffer(GetCPUAllocator())); MACE_RETURN_IF_ERROR(tensor_buf->Allocate( mem_block.x() * GetEnumTypeSize(dtype) + MACE_EXTRA_BUFFER_PAD_SIZE)); @@ -235,14 +239,14 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, std::move(tensor_buf)); } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { std::unique_ptr image_buf( - new Image()); + new Image(device->allocator())); MACE_RETURN_IF_ERROR(image_buf->Allocate( {mem_block.x(), mem_block.y()}, dtype)); preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { std::unique_ptr tensor_buf( - new Buffer(GetDeviceAllocator(DeviceType::GPU))); + new Buffer(device->allocator())); MACE_RETURN_IF_ERROR(tensor_buf->Allocate( mem_block.x() * GetEnumTypeSize(dtype))); preallocated_allocator_.SetBuffer(mem_block.mem_id(), @@ -305,7 +309,7 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, op, "T", static_cast(DT_FLOAT))); } CreateTensor(op.output(i), - GetDeviceAllocator(device_type), + device->allocator(), output_type); } } @@ -335,7 +339,8 @@ void Workspace::RemoveUnusedBuffer() { } void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, - const unsigned char *model_data) { + const unsigned char *model_data, + Allocator *alloc) { for (auto &const_tensor : net_def.tensors()) { auto iter = tensor_map_.find(const_tensor.name()); if (iter->second->unused()) { @@ -347,8 +352,7 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, dims.push_back(d); } std::unique_ptr tensor( - new Tensor(GetDeviceAllocator(DeviceType::GPU), - const_tensor.data_type())); + new Tensor(alloc, const_tensor.data_type())); tensor->Resize(dims); MACE_CHECK(tensor->size() == const_tensor.data_size(), "Tensor's data_size not equal with the shape"); diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 20f214b0..71850098 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -20,6 +20,7 @@ #include #include +#include "mace/core/device.h" #include "mace/core/preallocated_pooled_allocator.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" @@ -48,7 +49,7 @@ class Workspace { std::vector Tensors() const; MaceStatus LoadModelTensor(const NetDef &net_def, - DeviceType type, + Device *device, const unsigned char *model_data); ScratchBuffer *GetScratchBuffer(DeviceType device_type); @@ -56,11 +57,14 @@ class Workspace { void RemoveUnusedBuffer(); void RemoveAndReloadBuffer(const NetDef &net_def, - const unsigned char *model_data); + const unsigned char *model_data, + Allocator *alloc); private: MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, - DeviceType device_type); + Device *device); + + Device *device_; TensorMap tensor_map_; diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java index edd7bf18..5788801c 100644 --- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java +++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/AppModel.java @@ -37,15 +37,13 @@ public class AppModel { mJniThread = new Handler(thread.getLooper()); } - public void maceMobilenetSetAttrs(final InitData initData) { + public void maceMobilenetCreateGPUContext(final InitData initData) { mJniThread.post(new Runnable() 
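Note: the three MemoryType branches above follow one rule: CPU blocks and GPU buffers are linear Buffers sized in bytes, GPU images are 2-D Images sized in pixels, and only the CPU case gets MACE_EXTRA_BUFFER_PAD_SIZE padding. A hedged condensation; the AllocateBlock helper is mine, error propagation is elided, and the MemoryBlock accessor names are taken from the hunks above:

    // Sketch: build one preallocated block, dispatching on its memory type.
    std::unique_ptr<BufferBase> AllocateBlock(const MemoryBlock &mem_block,
                                              Device *device,
                                              DataType dtype) {
      std::unique_ptr<BufferBase> buf;
      if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
        buf.reset(new Buffer(GetCPUAllocator()));
        buf->Allocate(mem_block.x() * GetEnumTypeSize(dtype) +
                      MACE_EXTRA_BUFFER_PAD_SIZE);  // extra pad, CPU case only
      } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
        buf.reset(new Image(device->allocator()));
        buf->Allocate({mem_block.x(), mem_block.y()}, dtype);  // pixels, not bytes
      } else {  // MemoryType::GPU_BUFFER
        buf.reset(new Buffer(device->allocator()));
        buf->Allocate(mem_block.x() * GetEnumTypeSize(dtype));
      }
      return buf;
    }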
{ @Override public void run() { - int result = JniMaceUtils.maceMobilenetSetAttrs( - initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(), - initData.getGpuPerfHint(), initData.getGpuPriorityHint(), - initData.getKernelPath()); - Log.i("APPModel", "maceMobilenetSetAttrs result = " + result); + int result = JniMaceUtils.maceMobilenetCreateGPUContext( + initData.getStoragePath()); + Log.i("APPModel", "maceMobilenetCreateGPUContext result = " + result); } }); } @@ -54,7 +52,10 @@ public class AppModel { mJniThread.post(new Runnable() { @Override public void run() { - int result = JniMaceUtils.maceMobilenetCreateEngine(initData.getModel(), initData.getDevice()); + int result = JniMaceUtils.maceMobilenetCreateEngine( + initData.getOmpNumThreads(), initData.getCpuAffinityPolicy(), + initData.getGpuPerfHint(), initData.getGpuPriorityHint(), + initData.getModel(), initData.getDevice()); Log.i("APPModel", "maceMobilenetCreateEngine result = " + result); if (result == -1) { diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java index ab62a90f..f8adafc8 100644 --- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java +++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/CameraActivity.java @@ -139,7 +139,7 @@ public class CameraActivity extends Activity implements View.OnClickListener, Ap } private void initJni() { - AppModel.instance.maceMobilenetSetAttrs(initData); + AppModel.instance.maceMobilenetCreateGPUContext(initData); AppModel.instance.maceMobilenetCreateEngine(initData, this); } diff --git a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java index ab0f54b5..ffcbde96 100644 --- a/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java +++ b/mace/examples/android/app/src/main/java/com/xiaomi/mace/demo/result/InitData.java @@ -29,7 +29,7 @@ public class InitData { private int cpuAffinityPolicy; private int gpuPerfHint; private int gpuPriorityHint; - private String kernelPath = ""; + private String storagePath = ""; public InitData() { model = MODELS[0]; @@ -38,8 +38,8 @@ public class InitData { gpuPerfHint = 3; gpuPriorityHint = 3; device = DEVICES[0]; - kernelPath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace"; - File file = new File(kernelPath); + storagePath = Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "mace"; + File file = new File(storagePath); if (!file.exists()) { file.mkdir(); } @@ -94,11 +94,11 @@ public class InitData { this.gpuPriorityHint = gpuPriorityHint; } - public String getKernelPath() { - return kernelPath; + public String getStoragePath() { + return storagePath; } - public void setKernelPath(String kernelPath) { - this.kernelPath = kernelPath; + public void setStoragePath(String storagePath) { + this.storagePath = storagePath; } } diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc index 0a070270..4ccba56e 100755 --- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc +++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -26,7 +26,6 @@ #include #include "src/main/cpp/include/mace/public/mace.h" -#include "src/main/cpp/include/mace/public/mace_runtime.h" #include 
"src/main/cpp/include/mace/public/mace_engine_factory.h" namespace { @@ -39,8 +38,8 @@ struct ModelInfo { }; struct MaceContext { + std::shared_ptr gpu_context; std::shared_ptr engine; - std::shared_ptr storage_factory; std::string model_name; mace::DeviceType device_type = mace::DeviceType::CPU; std::map model_infos = { @@ -72,48 +71,65 @@ MaceContext& GetMaceContext() { } // namespace -JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs( - JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, - jint gpu_perf_hint, jint gpu_priority_hint, jstring kernel_path) { +JNIEXPORT jint JNICALL +Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext( + JNIEnv *env, jclass thisObj, jstring storage_path) { MaceContext &mace_context = GetMaceContext(); + // DO NOT USE tmp directory. + // Please use APP's own directory and make sure the directory exists. + const char *storage_path_ptr = env->GetStringUTFChars(storage_path, nullptr); + if (storage_path_ptr == nullptr) return JNI_ERR; + const std::string storage_file_path(storage_path_ptr); + env->ReleaseStringUTFChars(storage_path, storage_path_ptr); - mace::MaceStatus status; - // openmp - status = mace::SetOpenMPThreadPolicy( - omp_num_threads, - static_cast(cpu_affinity_policy)); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "openmp result: %d, threads: %d, cpu: %d", - status, omp_num_threads, cpu_affinity_policy); - - // gpu - mace::SetGPUHints( - static_cast(gpu_perf_hint), - static_cast(gpu_priority_hint)); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "gpu perf: %d, priority: %d", - gpu_perf_hint, gpu_priority_hint); - - // opencl cache - const char *kernel_path_ptr = env->GetStringUTFChars(kernel_path, nullptr); - if (kernel_path_ptr == nullptr) return JNI_ERR; - const std::string kernel_file_path(kernel_path_ptr); - mace_context.storage_factory.reset( - new mace::FileStorageFactory(kernel_file_path)); - mace::SetKVStorageFactory(mace_context.storage_factory); - env->ReleaseStringUTFChars(kernel_path, kernel_path_ptr); + mace_context.gpu_context = mace::GPUContextBuilder() + .SetStoragePath(storage_file_path) + .Finalize(); return JNI_OK; } JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( - JNIEnv *env, jclass thisObj, jstring model_name_str, jstring device) { + JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, + jint gpu_perf_hint, jint gpu_priority_hint, + jstring model_name_str, jstring device) { MaceContext &mace_context = GetMaceContext(); + + // get device + const char *device_ptr = env->GetStringUTFChars(device, nullptr); + if (device_ptr == nullptr) return JNI_ERR; + mace_context.device_type = ParseDeviceType(device_ptr); + env->ReleaseStringUTFChars(device, device_ptr); + + // create MaceEngineConfig + mace::MaceStatus status; + mace::MaceEngineConfig config(mace_context.device_type); + status = config.SetCPUThreadPolicy( + omp_num_threads, + static_cast(cpu_affinity_policy)); + if (status != mace::MACE_SUCCESS) { + __android_log_print(ANDROID_LOG_ERROR, + "image_classify attrs", + "openmp result: %d, threads: %d, cpu: %d", + status, omp_num_threads, cpu_affinity_policy); + } + if (mace_context.device_type == mace::DeviceType::GPU) { + config.SetGPUContext(mace_context.gpu_context); + config.SetGPUHints( + static_cast(gpu_perf_hint), + static_cast(gpu_priority_hint)); + __android_log_print(ANDROID_LOG_INFO, + "image_classify attrs", + "gpu perf: %d, priority: %d", + 
gpu_perf_hint, gpu_priority_hint); + } + + __android_log_print(ANDROID_LOG_INFO, + "image_classify attrs", + "device: %d", + mace_context.device_type); + // parse model name const char *model_name_ptr = env->GetStringUTFChars(model_name_str, nullptr); if (model_name_ptr == nullptr) return JNI_ERR; @@ -133,26 +149,15 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( std::vector input_names = {model_info_iter->second.input_name}; std::vector output_names = {model_info_iter->second.output_name}; - // get device - const char *device_ptr = env->GetStringUTFChars(device, nullptr); - if (device_ptr == nullptr) return JNI_ERR; - mace_context.device_type = ParseDeviceType(device_ptr); - env->ReleaseStringUTFChars(device, device_ptr); - - __android_log_print(ANDROID_LOG_ERROR, - "image_classify attrs", - "device: %d", - mace_context.device_type); - mace::MaceStatus create_engine_status = CreateMaceEngineFromCode(mace_context.model_name, std::string(), input_names, output_names, - mace_context.device_type, + config, &mace_context.engine); - __android_log_print(ANDROID_LOG_ERROR, + __android_log_print(ANDROID_LOG_INFO, "image_classify attrs", "create result: %d", create_engine_status); diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h index bef7417b..5114eb91 100644 --- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.h +++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.h @@ -24,11 +24,13 @@ extern "C" { #endif /* * Class: com_xiaomi_mace_JniMaceUtils - * Method: maceMobilenetSetAttrs + * Method: maceMobilenetCreateGPUContext * Signature: (Ljava/lang/String;IIIILjava/lang/String;)I */ -JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs - (JNIEnv *, jclass, jint, jint, jint, jint, jstring); +JNIEXPORT jint JNICALL +Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(JNIEnv *, + jclass, + jstring); /* * Class: com_xiaomi_mace_JniMaceUtils @@ -37,7 +39,7 @@ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetSetAttrs */ JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine - (JNIEnv *, jclass, jstring, jstring); + (JNIEnv *, jclass, jint, jint, jint, jint, jstring, jstring); /* * Class: com_xiaomi_mace_JniMaceUtils diff --git a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java index f9ab7a7a..e776c013 100644 --- a/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java +++ b/mace/examples/android/macelibrary/src/main/java/com/xiaomi/mace/JniMaceUtils.java @@ -20,9 +20,9 @@ public class JniMaceUtils { System.loadLibrary("mace_mobile_jni"); } - public static native int maceMobilenetSetAttrs(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String kernelPath); + public static native int maceMobilenetCreateGPUContext(String storagePath); - public static native int maceMobilenetCreateEngine(String model, String device); + public static native int maceMobilenetCreateEngine(int ompNumThreads, int cpuAffinityPolicy, int gpuPerfHint, int gpuPriorityHint, String model, String device); public static native float[] maceMobilenetClassify(float[] input); diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 4892baf2..99436fa4 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc 
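Note: once CreateMaceEngineFromCode returns with the config-built engine, inference itself is unchanged from the old API. A sketch of the call that typically follows; the RunOnce wrapper and the shapes are illustrative, not part of this patch:

    // Sketch: one synchronous inference with the engine created above.
    #include <map>
    #include <memory>
    #include <string>
    #include <vector>
    #include "mace/public/mace.h"

    mace::MaceStatus RunOnce(mace::MaceEngine *engine,
                             const std::string &input_name,
                             const std::string &output_name) {
      const std::vector<int64_t> input_shape = {1, 224, 224, 3};  // NHWC
      const std::vector<int64_t> output_shape = {1, 1001};
      auto input_data = std::shared_ptr<float>(new float[1 * 224 * 224 * 3],
                                               std::default_delete<float[]>());
      auto output_data = std::shared_ptr<float>(new float[1 * 1001],
                                                std::default_delete<float[]>());
      // ... fill input_data with preprocessed pixels here ...
      std::map<std::string, mace::MaceTensor> inputs;
      std::map<std::string, mace::MaceTensor> outputs;
      inputs[input_name] = mace::MaceTensor(input_shape, input_data);
      outputs[output_name] = mace::MaceTensor(output_shape, output_data);
      return engine->Run(inputs, &outputs);
    }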
@@ -21,7 +21,6 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" // if convert model to code. #ifdef MODEL_GRAPH_FORMAT_CODE #include "mace/codegen/engine/mace_engine_factory.h" @@ -157,40 +156,40 @@ bool RunModel(const std::vector &input_names, const std::vector> &output_shapes) { // load model DeviceType device_type = ParseDeviceType(FLAGS_device); - // config runtime - mace::SetOpenMPThreadPolicy( + // configuration + // Detailed information please see mace.h + MaceStatus status; + MaceEngineConfig config(device_type); + status = config.SetCPUThreadPolicy( FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy)); + if (status != MACE_SUCCESS) { + std::cerr << "Set openmp or cpu affinity failed." << std::endl; + } #ifdef MACE_ENABLE_OPENCL + std::shared_ptr gpu_context; if (device_type == DeviceType::GPU) { - mace::SetGPUHints( - static_cast(FLAGS_gpu_perf_hint), - static_cast(FLAGS_gpu_priority_hint)); - - // Just call once. (Not thread-safe) - // Set paths of Generated OpenCL Compiled Kernel Binary file - // if you build gpu library of specific soc. - // Using OpenCL binary will speed up the initialization. - // OpenCL binary is corresponding to the OpenCL Driver version, - // you should update the binary when OpenCL Driver changed. + // DO NOT USE tmp directory. + // Please use APP's own directory and make sure the directory exists. + const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH"); + const std::string storage_path = + std::string(storage_path_ptr == nullptr ? + "/data/local/tmp/mace_run/interior" : storage_path_ptr); std::vector opencl_binary_paths = {FLAGS_opencl_binary_file}; - mace::SetOpenCLBinaryPaths(opencl_binary_paths); - mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); + gpu_context = GPUContextBuilder() + .SetStoragePath(storage_path) + .SetOpenCLBinaryPaths(opencl_binary_paths) + .SetOpenCLParameterPath(FLAGS_opencl_parameter_file) + .Finalize(); + + config.SetGPUContext(gpu_context); + config.SetGPUHints( + static_cast(FLAGS_gpu_perf_hint), + static_cast(FLAGS_gpu_priority_hint)); } #endif // MACE_ENABLE_OPENCL - // DO NOT USE tmp directory. - // Please use APP's own directory and make sure the directory exists. - // Just call once - const std::string internal_storage_path = - "/data/local/tmp/mace_run/interior"; - - // Config internal kv storage factory. 
- std::shared_ptr storage_factory( - new FileStorageFactory(internal_storage_path)); - SetKVStorageFactory(storage_factory); - // Create Engine std::shared_ptr engine; MaceStatus create_engine_status; @@ -204,7 +203,7 @@ bool RunModel(const std::vector &input_names, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #else std::vector model_pb_data; @@ -216,7 +215,7 @@ bool RunModel(const std::vector &input_names, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #endif diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 51383ad4..3159684d 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -126,10 +127,14 @@ template class ActivationFunctor; template <> -class ActivationFunctor { +class ActivationFunctor : OpKernel { public: - ActivationFunctor(ActivationType type, float relux_max_limit) - : activation_(type), relux_max_limit_(relux_max_limit) {} + ActivationFunctor(OpKernelContext *context, + ActivationType type, + float relux_max_limit) + : OpKernel(context), + activation_(type), + relux_max_limit_(relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *alpha, @@ -159,10 +164,14 @@ class ActivationFunctor { #ifdef MACE_ENABLE_OPENCL template -class ActivationFunctor { +class ActivationFunctor : OpKernel { public: - ActivationFunctor(ActivationType type, T relux_max_limit) - : activation_(type), relux_max_limit_(static_cast(relux_max_limit)) {} + ActivationFunctor(OpKernelContext *context, + ActivationType type, + T relux_max_limit) + : OpKernel(context), + activation_(type), + relux_max_limit_(static_cast(relux_max_limit)) {} MaceStatus operator()(const Tensor *input, const Tensor *alpha, diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 2215343f..d81f25a3 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -24,6 +24,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -35,10 +36,11 @@ namespace kernels { constexpr int kCostPerGroup = 1024; template -struct AddNFunctor { +struct AddNFunctor : OpKernel { + explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor, + StatsFuture *future) { MACE_UNUSED(future); MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); index_t size = output_tensor->size(); @@ -95,7 +97,8 @@ struct AddNFunctor { #ifdef MACE_ENABLE_OPENCL template -struct AddNFunctor { +struct AddNFunctor : OpKernel { + explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const std::vector &input_tensors, Tensor *output_tensor, StatsFuture *future); diff --git a/mace/kernels/argmax.h b/mace/kernels/argmax.h index 54edc3ee..36218d62 100644 --- a/mace/kernels/argmax.h +++ b/mace/kernels/argmax.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" @@ -30,7 +31,8 @@ namespace mace { namespace kernels { template -struct ArgMaxFunctor { +struct ArgMaxFunctor : OpKernel { + explicit 
ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const Tensor *axis, Tensor *output, diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/kernels/arm/conv_winograd_test.cc index 166b67a5..13135432 100644 --- a/mace/kernels/arm/conv_winograd_test.cc +++ b/mace/kernels/arm/conv_winograd_test.cc @@ -37,10 +37,10 @@ TEST(ConvWinogradTest, winograd) { index_t filter_size = 3 * 3 * in_channels * out_channels; index_t output_size = batch * out_channels * out_height * out_width; - Tensor input; - Tensor filter; - Tensor output; - Tensor output_ref; + Tensor input(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor filter(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor output(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor output_ref(GetCPUAllocator(), DataType::DT_FLOAT); input.Resize({batch, in_channels, in_height, in_width}); filter.Resize({out_channels, in_channels, 3, 3}); diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 6f934e6b..4c9aac3a 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -33,11 +33,13 @@ namespace mace { namespace kernels { -struct BatchNormFunctorBase { - BatchNormFunctorBase(bool folded_constant, +struct BatchNormFunctorBase : OpKernel { + BatchNormFunctorBase(OpKernelContext *context, + bool folded_constant, const ActivationType activation, const float relux_max_limit) - : folded_constant_(folded_constant), + : OpKernel(context), + folded_constant_(folded_constant), activation_(activation), relux_max_limit_(relux_max_limit) {} @@ -51,10 +53,14 @@ struct BatchNormFunctor; template<> struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, + BatchNormFunctor(OpKernelContext *context, + const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + : BatchNormFunctorBase(context, + folded_constant, + activation, + relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, @@ -132,10 +138,14 @@ struct BatchNormFunctor : BatchNormFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct BatchNormFunctor : BatchNormFunctorBase { - BatchNormFunctor(const bool folded_constant, + BatchNormFunctor(OpKernelContext *context, + const bool folded_constant, const ActivationType activation, const float relux_max_limit) - : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + : BatchNormFunctorBase(context, + folded_constant, + activation, + relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *scale, const Tensor *offset, diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index 1cd8421c..e2ea8ccf 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,10 +31,10 @@ namespace mace { namespace kernels { -struct BiasAddFunctorBase { - explicit BiasAddFunctorBase(const DataFormat data_format) { - data_format_ = data_format; - } +struct BiasAddFunctorBase : OpKernel { + BiasAddFunctorBase(OpKernelContext *context, + const DataFormat data_format) + : OpKernel(context), data_format_(data_format) {} DataFormat data_format_; }; @@ -43,8 +44,9 @@ struct BiasAddFunctor; template <> struct BiasAddFunctor : BiasAddFunctorBase { - explicit BiasAddFunctor(const DataFormat data_format) - : 
BiasAddFunctorBase(data_format) {} + BiasAddFunctor(OpKernelContext *context, + const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, @@ -96,8 +98,8 @@ struct BiasAddFunctor : BiasAddFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct BiasAddFunctor : BiasAddFunctorBase { - explicit BiasAddFunctor(const DataFormat data_format) - : BiasAddFunctorBase(data_format) {} + BiasAddFunctor(OpKernelContext *context, const DataFormat data_format) + : BiasAddFunctorBase(context, data_format) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, Tensor *output, diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index 1def9087..4a2f731b 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -20,21 +20,24 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { -struct BufferToImageFunctorBase { - explicit BufferToImageFunctorBase(const int wino_blk_size) - : wino_blk_size_(wino_blk_size) {} +struct BufferToImageFunctorBase : OpKernel { + explicit BufferToImageFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(const int wino_blk_size) - : BufferToImageFunctorBase(wino_blk_size) {} + explicit BufferToImageFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferToImageFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -50,8 +53,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { template struct BufferToImageFunctor : BufferToImageFunctorBase { - explicit BufferToImageFunctor(const int wino_blk_size) - : BufferToImageFunctorBase(wino_blk_size) {} + explicit BufferToImageFunctor(OpKernelContext *context, + const int wino_blk_size) + : BufferToImageFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index 920e1e1a..029eb1c6 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -20,13 +20,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { template -struct ChannelShuffleFunctor { - explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} +struct ChannelShuffleFunctor : OpKernel { + ChannelShuffleFunctor(OpKernelContext *context, const int groups) + : OpKernel(context), groups_(groups) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -70,8 +72,9 @@ struct ChannelShuffleFunctor { #ifdef MACE_ENABLE_OPENCL template -struct ChannelShuffleFunctor { - explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {} +struct ChannelShuffleFunctor : OpKernel { + ChannelShuffleFunctor(OpKernelContext *context, const int groups) + : OpKernel(context), groups_(groups) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index 1728ca08..696d4ff0 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include 
"mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,15 +31,17 @@ namespace mace { namespace kernels { -struct ConcatFunctorBase { - explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {} +struct ConcatFunctorBase : OpKernel { + ConcatFunctorBase(OpKernelContext *context, const int32_t axis) + : OpKernel(context), axis_(axis) {} int32_t axis_; }; template struct ConcatFunctor : ConcatFunctorBase { - explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} + ConcatFunctor(OpKernelContext *context, const int32_t axis) + : ConcatFunctorBase(context, axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, @@ -97,7 +100,8 @@ struct ConcatFunctor : ConcatFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct ConcatFunctor : ConcatFunctorBase { - explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} + ConcatFunctor(OpKernelContext *context, const int32_t axis) + : ConcatFunctorBase(context, axis) {} MaceStatus operator()(const std::vector &input_list, Tensor *output, diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 282472bc..ce9bb11d 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -42,14 +42,16 @@ namespace mace { namespace kernels { -struct Conv2dFunctorBase { - Conv2dFunctorBase(const int *strides, +struct Conv2dFunctorBase : OpKernel { + Conv2dFunctorBase(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), dilations_(dilations), @@ -69,7 +71,8 @@ struct Conv2dFunctor; template<> struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -77,12 +80,14 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, activation, relux_max_limit), + transformed_filter_(GetCPUAllocator(), DataType::DT_FLOAT), is_filter_transformed_(is_filter_transformed), scratch_(scratch) {} @@ -721,7 +726,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { template<> struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -729,7 +735,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -949,7 +956,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(const int *strides, + Conv2dFunctor(OpKernelContext *context, + const int *strides, const Padding &padding_type, const std::vector &paddings, const int *dilations, @@ -957,7 +965,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const float relux_max_limit, const bool is_filter_transformed, ScratchBuffer *scratch) - : Conv2dFunctorBase(strides, + : Conv2dFunctorBase(context, + 
strides, padding_type, paddings, dilations, @@ -968,10 +977,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; diff --git a/mace/kernels/crop.h b/mace/kernels/crop.h index 241584e8..6ad9650e 100644 --- a/mace/kernels/crop.h +++ b/mace/kernels/crop.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -30,10 +31,12 @@ namespace mace { namespace kernels { -struct CropFunctorBase { - CropFunctorBase(const int axis, +struct CropFunctorBase : OpKernel { + CropFunctorBase(OpKernelContext *context, + const int axis, const std::vector &offset) - : axis_(axis), + : OpKernel(context), + axis_(axis), offset_(offset) {} const int axis_; @@ -42,8 +45,10 @@ struct CropFunctorBase { template struct CropFunctor : CropFunctorBase { - CropFunctor(const int axis, const std::vector &offset) - : CropFunctorBase(axis, offset) {} + CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : CropFunctorBase(context, axis, offset) {} void crop_copy(const T* input_data, T* output_data, const std::vector &input_shape, @@ -121,12 +126,14 @@ struct CropFunctor : CropFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct CropFunctor : CropFunctorBase { - CropFunctor(const int axis, const std::vector &offset) - : CropFunctorBase(axis, offset) {} + CropFunctor(OpKernelContext *context, + const int axis, + const std::vector &offset) + : CropFunctorBase(context, axis, offset) {} MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); + Tensor *output, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 9450104d..4bfc4d61 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -89,14 +89,16 @@ void Deconv2dNCHW(const T *input, } } // namespace deconv -struct Deconv2dFunctorBase { - Deconv2dFunctorBase(const std::vector &strides, +struct Deconv2dFunctorBase : OpKernel { + Deconv2dFunctorBase(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), output_shape_(output_shape), @@ -210,13 +212,15 @@ struct Deconv2dFunctorBase { template struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(const std::vector &strides, + Deconv2dFunctor(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : Deconv2dFunctorBase(strides, + : Deconv2dFunctorBase(context, + strides, padding_type, paddings, output_shape, @@ -315,13 +319,15 @@ struct Deconv2dFunctor : Deconv2dFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(const std::vector &strides, + Deconv2dFunctor(OpKernelContext *context, + const std::vector &strides, const Padding &padding_type, const 
std::vector &paddings, const std::vector &output_shape, const ActivationType activation, const float relux_max_limit) - : Deconv2dFunctorBase(strides, + : Deconv2dFunctorBase(context, + strides, padding_type, paddings, output_shape, diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index c0e0f267..7c4a7456 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -19,6 +19,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -29,9 +30,11 @@ namespace mace { namespace kernels { template -struct DepthToSpaceOpFunctor { - explicit DepthToSpaceOpFunctor(const int block_size, bool d2s) - : block_size_(block_size), d2s_(d2s) {} +struct DepthToSpaceOpFunctor : OpKernel { + DepthToSpaceOpFunctor(OpKernelContext *context, + const int block_size, + bool d2s) + : OpKernel(context), block_size_(block_size), d2s_(d2s) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -123,9 +126,11 @@ struct DepthToSpaceOpFunctor { #ifdef MACE_ENABLE_OPENCL template -struct DepthToSpaceOpFunctor { - DepthToSpaceOpFunctor(const int block_size, bool d2s) - : block_size_(block_size), d2s_(d2s) {} +struct DepthToSpaceOpFunctor : OpKernel { + DepthToSpaceOpFunctor(OpKernelContext *context, + const int block_size, + bool d2s) + : OpKernel(context), block_size_(block_size), d2s_(d2s) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 9304b14f..3b2eb70b 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -37,14 +37,16 @@ namespace mace { namespace kernels { -struct DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctorBase(const int *strides, +struct DepthwiseConv2dFunctorBase : OpKernel { + DepthwiseConv2dFunctorBase(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : strides_(strides), + : OpKernel(context), + strides_(strides), padding_type_(padding_type), paddings_(paddings), dilations_(dilations), @@ -65,13 +67,15 @@ struct DepthwiseConv2dFunctor; template<> struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -288,13 +292,15 @@ struct DepthwiseConv2dFunctor template<> struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, @@ -451,7 +457,7 @@ struct DepthwiseConv2dFunctor const int32_t *bias_data = nullptr; if (bias == nullptr) { zero_bias.reset( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); + new Tensor(GetCPUAllocator(), DT_INT32)); 
zero_bias->Resize(bias_shape); zero_bias->Clear(); bias_data = zero_bias->data(); @@ -495,13 +501,15 @@ struct DepthwiseConv2dFunctor template struct DepthwiseConv2dFunctor : DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(const int *strides, + DepthwiseConv2dFunctor(OpKernelContext *context, + const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations, const ActivationType activation, const float relux_max_limit) - : DepthwiseConv2dFunctorBase(strides, + : DepthwiseConv2dFunctorBase(context, + strides, padding_type, paddings, dilations, diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 42d220fa..9e9a2be8 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -802,13 +803,15 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } -struct EltwiseFunctorBase { - EltwiseFunctorBase(const EltwiseType type, +struct EltwiseFunctorBase : OpKernel { + EltwiseFunctorBase(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, const DataFormat data_format) - : type_(type), + : OpKernel(context), + type_(type), coeff_(coeff), scalar_input_(scalar_input), scalar_input_index_(scalar_input_index), @@ -823,12 +826,14 @@ struct EltwiseFunctorBase { template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, + EltwiseFunctor(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, // float as it comes from arg const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(type, + : EltwiseFunctorBase(context, + type, coeff, scalar_input, scalar_input_index, @@ -956,12 +961,14 @@ struct EltwiseFunctor : EltwiseFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct EltwiseFunctor : EltwiseFunctorBase { - EltwiseFunctor(const EltwiseType type, + EltwiseFunctor(OpKernelContext *context, + const EltwiseType type, const std::vector &coeff, const float scalar_input, const int32_t scalar_input_index, const DataFormat data_format) - : EltwiseFunctorBase(type, + : EltwiseFunctorBase(context, + type, coeff, scalar_input, scalar_input_index, diff --git a/mace/kernels/fill.h b/mace/kernels/fill.h index b534a183..131dd9d4 100644 --- a/mace/kernels/fill.h +++ b/mace/kernels/fill.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { @@ -30,8 +31,8 @@ template struct FillFunctor; template <> -struct FillFunctor { - FillFunctor() {} +struct FillFunctor : OpKernel { + explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *shape, const Tensor *value, diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index e5172920..e6743aa4 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -27,10 +27,12 @@ namespace mace { namespace kernels { -struct FullyConnectedBase { - FullyConnectedBase(const ActivationType activation, +struct FullyConnectedBase : OpKernel { + FullyConnectedBase(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : activation_(activation), + : OpKernel(context), + activation_(activation), 
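Note: in the eltwise functor above, scalar_input_ and scalar_input_index_ exist because a scalar operand is folded into the functor instead of being passed as a tensor, and the index records which side of a non-commutative op the scalar sat on. A hypothetical construction, assuming an OpKernelContext *context from the op-building path and the EltwiseType/DataFormat enumerators defined in these headers:

    // Hypothetical: computes x - 3.0, scalar folded in on the right-hand side.
    EltwiseFunctor<DeviceType::CPU, float> sub_scalar(
        context, EltwiseType::SUB, /*coeff=*/{}, /*scalar_input=*/3.0f,
        /*scalar_input_index=*/1, DataFormat::NCHW);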
relux_max_limit_(relux_max_limit) {} const ActivationType activation_; @@ -42,9 +44,10 @@ struct FullyConnectedFunctor; template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, @@ -86,9 +89,10 @@ struct FullyConnectedFunctor: FullyConnectedBase { template <> struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, @@ -117,7 +121,7 @@ struct FullyConnectedFunctor: FullyConnectedBase { const int32_t *bias_ptr = nullptr; if (bias == nullptr) { zero_bias.reset( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32)); + new Tensor(GetCPUAllocator(), DT_INT32)); zero_bias->Resize(bias_shape); zero_bias->Clear(); bias_ptr = zero_bias->data(); @@ -148,9 +152,10 @@ struct FullyConnectedFunctor: FullyConnectedBase { #ifdef MACE_ENABLE_OPENCL template struct FullyConnectedFunctor : FullyConnectedBase { - FullyConnectedFunctor(const ActivationType activation, + FullyConnectedFunctor(OpKernelContext *context, + const ActivationType activation, const float relux_max_limit) - : FullyConnectedBase(activation, relux_max_limit) {} + : FullyConnectedBase(context, activation, relux_max_limit) {} MaceStatus operator()(const Tensor *input, const Tensor *weight, diff --git a/mace/kernels/gather.h b/mace/kernels/gather.h index 101a60e3..ddfa14d1 100644 --- a/mace/kernels/gather.h +++ b/mace/kernels/gather.h @@ -21,13 +21,15 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { -struct GatherBase { - explicit GatherBase(int axis, float y) : axis_(axis), y_(y) {} +struct GatherBase : OpKernel { + GatherBase(OpKernelContext *context, int axis, float y) + : OpKernel(context), axis_(axis), y_(y) {} int axis_; float y_; @@ -38,7 +40,8 @@ struct GatherFunctor; template <> struct GatherFunctor : GatherBase { - explicit GatherFunctor(int axis, float y) : GatherBase(axis, y) {} + GatherFunctor(OpKernelContext *context, int axis, float y) + : GatherBase(context, axis, y) {} MaceStatus operator()(const Tensor *params, const Tensor *indices, diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc index c94c0af5..5043a104 100644 --- a/mace/kernels/gemm.cc +++ b/mace/kernels/gemm.cc @@ -1341,8 +1341,8 @@ void Gemm(const float *A, ik_begin = bk * block_size_k + (bk < remain_k ? 
bk : remain_k); const index_t ik_end = std::min(K, ik_begin + this_block_size_k); - Tensor trans_a; - Tensor trans_b; + Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT); const float *real_a = nullptr; const float *real_b = nullptr; float *real_c = c_base + (ih_begin * width + iw_begin); @@ -1399,8 +1399,8 @@ void GemmRef(const float *A, const bool transpose_b) { memset(C, 0, sizeof(float) * batch * height * width); - Tensor trans_a; - Tensor trans_b; + Tensor trans_a(GetCPUAllocator(), DataType::DT_FLOAT); + Tensor trans_b(GetCPUAllocator(), DataType::DT_FLOAT); float *trans_a_data = nullptr; float *trans_b_data = nullptr; if (transpose_a) { diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index 4e6b057f..c4394fda 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -20,21 +20,24 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/opencl/common.h" namespace mace { namespace kernels { -struct ImageToBufferFunctorBase { - explicit ImageToBufferFunctorBase(const int wino_blk_size) - : wino_blk_size_(wino_blk_size) {} +struct ImageToBufferFunctorBase : OpKernel { + ImageToBufferFunctorBase(OpKernelContext *context, + const int wino_blk_size) + : OpKernel(context), + wino_blk_size_(wino_blk_size) {} const int wino_blk_size_; }; template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - explicit ImageToBufferFunctor(const int wino_blk_size) - : ImageToBufferFunctorBase(wino_blk_size) {} + ImageToBufferFunctor(OpKernelContext *context, const int wino_blk_size) + : ImageToBufferFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -50,8 +53,9 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - explicit ImageToBufferFunctor(const int wino_blk_size) - : ImageToBufferFunctorBase(wino_blk_size) {} + ImageToBufferFunctor(OpKernelContext *context, + const int wino_blk_size) + : ImageToBufferFunctorBase(context, wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/kernel.h b/mace/kernels/kernel.h new file mode 100644 index 00000000..853e974f --- /dev/null +++ b/mace/kernels/kernel.h @@ -0,0 +1,31 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_KERNEL_H_ +#define MACE_KERNELS_KERNEL_H_ + +#include "mace/core/op_kernel_context.h" + +namespace mace { +namespace kernels { + +struct OpKernel { + explicit OpKernel(OpKernelContext *context): context_(context) {} + + OpKernelContext *context_; +}; + +} // namespace kernels +} // namespace mace +#endif // MACE_KERNELS_KERNEL_H_ diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.h index 0af86327..d9eeb7db 100644 --- a/mace/kernels/local_response_norm.h +++ b/mace/kernels/local_response_norm.h @@ -21,7 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -34,7 +34,9 @@ template struct LocalResponseNormFunctor; template<> -struct LocalResponseNormFunctor { +struct LocalResponseNormFunctor : OpKernel { + explicit LocalResponseNormFunctor(OpKernelContext *context) + : OpKernel(context) {} MaceStatus operator()(const Tensor *input, int depth_radius, float bias, diff --git a/mace/kernels/lstmcell.h b/mace/kernels/lstmcell.h index 46439fae..cb6b86fd 100644 --- a/mace/kernels/lstmcell.h +++ b/mace/kernels/lstmcell.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #if defined(MACE_ENABLE_NEON) #include @@ -35,9 +36,10 @@ template struct LSTMCellFunctor; template -struct LSTMCellFunctor { - explicit LSTMCellFunctor(T forget_bias) : - forget_bias_(static_cast(forget_bias)) {} +struct LSTMCellFunctor : OpKernel{ + LSTMCellFunctor(OpKernelContext *context, T forget_bias) + : OpKernel(context), + forget_bias_(static_cast(forget_bias)) {} MaceStatus operator()(const Tensor *input, const Tensor *pre_output, const Tensor *weight, diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 42e76002..4b6c5cf1 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -29,6 +29,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/gemm.h" +#include "mace/kernels/kernel.h" #include "mace/utils/utils.h" #include "mace/kernels/gemmlowp_util.h" @@ -40,7 +41,8 @@ namespace mace { namespace kernels { template -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *A, const Tensor *B, Tensor *C, @@ -87,7 +89,7 @@ struct MatMulFunctor { // A * B = (B^T * A^T)^T if (!transpose_b) { if (B_transpose_.get() == nullptr) { - B_transpose_.reset(new Tensor(GetDeviceAllocator(D), + B_transpose_.reset(new Tensor(context_->device()->allocator(), DataTypeToEnum::v())); B_transpose_->Resize({batch, width, K}); Tensor::MappingGuard guardbt(B_transpose_.get()); @@ -112,7 +114,8 @@ struct MatMulFunctor { }; template <> -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} template void MatMulImpl(const Tensor *A, const Tensor *B, @@ -208,7 +211,8 @@ struct MatMulFunctor { #ifdef MACE_ENABLE_OPENCL template -struct MatMulFunctor { +struct MatMulFunctor : OpKernel { + explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *A, const Tensor *B, Tensor *C, diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc index 2cd0c2a3..7757758c 100644 --- a/mace/kernels/opencl/activation.cc +++ 
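Note: kernel.h is the pivot of the whole patch: every functor now derives from OpKernel and stores the OpKernelContext *, which is how the kernels below reach context_->device(). A hypothetical functor showing the post-refactor shape; ScaleFunctor is invented for illustration and is not an op in MACE:

    // Illustrative only: a made-up op written against the new OpKernel base.
    #include "mace/core/future.h"
    #include "mace/core/tensor.h"
    #include "mace/kernels/kernel.h"

    namespace mace {
    namespace kernels {

    template <DeviceType D, typename T>
    struct ScaleFunctor : OpKernel {
      ScaleFunctor(OpKernelContext *context, float scale)
          : OpKernel(context), scale_(scale) {}

      MaceStatus operator()(const Tensor *input, Tensor *output,
                            StatsFuture *future) {
        MACE_UNUSED(future);  // CPU path runs synchronously
        MACE_RETURN_IF_ERROR(output->ResizeLike(input));
        Tensor::MappingGuard input_guard(input);
        Tensor::MappingGuard output_guard(output);
        const T *in = input->data<T>();
        T *out = output->mutable_data<T>();
        for (index_t i = 0; i < input->size(); ++i) {
          out[i] = in[i] * static_cast<T>(scale_);
        }
        return MACE_SUCCESS;
      }

      float scale_;
    };

    }  // namespace kernels
    }  // namespace mace

The device-specific specializations then pull their backend handles from the same place: context_->device()->allocator() on the CPU path and context_->device()->opencl_runtime() on the GPU path, as the .cc files below do.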
b/mace/kernels/opencl/activation.cc @@ -33,11 +33,11 @@ MaceStatus ActivationFunctor::operator()( const index_t channel_blocks = RoundUpDiv4(channels); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); @@ -94,12 +94,12 @@ MaceStatus ActivationFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, gws, - lws, future)); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, + gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index f01baa71..7c1c1afc 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -34,7 +34,7 @@ MaceStatus AddNFunctor::operator()( const index_t width = input_tensors[0]->dim(2); const index_t channels = input_tensors[0]->dim(3); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); for (size_t i = 1; i < size; ++i) { MACE_CHECK_NOTNULL(input_tensors[i]); @@ -49,7 +49,7 @@ MaceStatus AddNFunctor::operator()( MACE_NOT_IMPLEMENTED; } std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn"); @@ -96,7 +96,7 @@ MaceStatus AddNFunctor::operator()( std::string tuning_key = Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index e26065d9..446a26cc 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -44,11 +44,11 @@ MaceStatus BatchNormFunctor::operator()( static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); @@ -101,11 +101,11 @@ MaceStatus BatchNormFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); 
OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc index aaa0d172..eae22c00 100644 --- a/mace/kernels/opencl/bias_add.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -39,12 +39,12 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; auto dt = DataTypeToEnum::value; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); @@ -65,7 +65,7 @@ MaceStatus BiasAddFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); cl::Event event; cl_int error; diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index c95ef0ad..75d0c4f5 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -75,12 +75,12 @@ MaceStatus BufferToImageFunctor::operator()( } } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index d7434683..64de09c2 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -41,11 +41,11 @@ MaceStatus ChannelShuffleFunctor::operator()( static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); @@ -72,11 +72,11 @@ MaceStatus ChannelShuffleFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 58b27faa..6fa4ba8f 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -22,13 +22,15 @@ namespace mace { namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = 
lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = std::min(base, kwg_size / lws[1]); @@ -41,7 +43,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { } // namespace -static MaceStatus Concat2(cl::Kernel *kernel, +static MaceStatus Concat2(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, @@ -61,11 +64,11 @@ static MaceStatus Concat2(cl::Kernel *kernel, static_cast(batch * height), }; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); @@ -100,17 +103,18 @@ static MaceStatus Concat2(cl::Kernel *kernel, *prev_input_shape = input0->shape(); } - const std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; } -static MaceStatus ConcatN(cl::Kernel *kernel, +static MaceStatus ConcatN(OpKernelContext *context, + cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, @@ -121,11 +125,11 @@ static MaceStatus ConcatN(cl::Kernel *kernel, const index_t height = output->dim(1); const index_t width = output->dim(2); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); @@ -148,7 +152,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, static_cast(input_channel_blk), static_cast(width), static_cast(batch * height), }; - const std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); uint32_t idx = 0; OUT_OF_RANGE_SET_ARG_PTR; @@ -168,8 +172,6 @@ static MaceStatus ConcatN(cl::Kernel *kernel, for (size_t j = 0; j < 3; ++j) { roundup_gws[j] = RoundUp(gws[j], lws[j]); } - const std::vector lws = LocalWS(gws, *kwg_size); - error = runtime->command_queue().enqueueNDRangeKernel( *kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), @@ -187,7 +189,7 @@ static MaceStatus ConcatN(cl::Kernel *kernel, } } if (future != nullptr) { - future->wait_fn = [runtime, call_stats](CallStats *stats) { + future->wait_fn = [call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -234,12 +236,14 @@ MaceStatus ConcatFunctor::operator()( switch (inputs_count) { case 2: - return Concat2(&kernel_, input_list[0], input_list[1], + return Concat2(context_, + &kernel_, input_list[0], 
input_list[1], DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_, &kernel_error_); default: if (divisible_four) { - return ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, + return ConcatN(context_, + &kernel_, input_list, DataTypeToEnum::value, output, future, &kwg_size_, &kernel_error_); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc index 6221382e..bc8538b7 100644 --- a/mace/kernels/opencl/conv_2d.cc +++ b/mace/kernels/opencl/conv_2d.cc @@ -18,7 +18,8 @@ namespace mace { namespace kernels { -extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -34,7 +35,8 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, uint32_t *kwg_size, std::unique_ptr *kernel_error); -extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -50,7 +52,8 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, uint32_t *kwg_size, std::unique_ptr *kernel_error); -extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, +extern MaceStatus Conv2dOpencl(OpKernelContext *runtime, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -73,9 +76,10 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, Tensor *output, StatsFuture *future) { typedef MaceStatus (*Conv2dOpenclFunction)( - cl::Kernel * kernel, const Tensor *input, const Tensor *filter, - const Tensor *bias, const int stride, const int *padding, - const int *dilations, const ActivationType activation, + OpKernelContext *runtime, cl::Kernel * kernel, const Tensor *input, + const Tensor *filter, const Tensor *bias, const int stride, + const int *padding, const int *dilations, + const ActivationType activation, const float relux_max_limit, const DataType dt, std::vector *input_shape, Tensor *output, StatsFuture *future, uint32_t *kwg_size, std::unique_ptr *kernel_error); @@ -116,12 +120,12 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, if (kernel_h == kernel_w && kernel_h <= 3 && selector[kernel_h - 1] != nullptr) { auto conv2d_func = selector[kernel_h - 1]; - return conv2d_func( + return conv2d_func(context_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_, &kernel_error_); } else { - return Conv2dOpencl( + return Conv2dOpencl(context_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_, &kernel_error_); diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index 770f0606..c43c0450 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -25,14 +25,16 @@ namespace { const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; // TODO(liuqi): Fix the specific value. 
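
[Note: every LocalWS helper in this patch gains an explicit OpenCLRuntime * parameter so device properties are queried on the instance rather than via the singleton. The self-contained C++ sketch below condenses the shared heuristic visible in these hunks; the stub runtime, its cache-size value, and kBaseGPUMemCacheSize = 16384 are assumptions for illustration, and each kernel's real version differs in detail.]

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Stand-in for the real OpenCLRuntime; only the one query the heuristic
    // needs is stubbed out here.
    struct OpenCLRuntime {
      uint64_t device_global_mem_cache_size() const { return 524288; }  // bytes
    };

    const uint32_t kBaseGPUMemCacheSize = 16384;  // assumed value

    // Condensed LocalWS heuristic: derive a base factor from the device's
    // global memory cache, then clamp each local dimension so that
    // lws[0] * lws[1] * lws[2] never exceeds the kernel's maximum
    // work-group size (kwg_size).
    std::vector<uint32_t> LocalWSSketch(OpenCLRuntime *runtime,
                                        const uint32_t *gws,
                                        const uint32_t kwg_size) {
      std::vector<uint32_t> lws(4, 0);
      if (kwg_size == 0) {
        lws[0] = lws[1] = lws[2] = 1;  // no device info; fall back to 1x1x1
      } else {
        uint64_t cache_size = runtime->device_global_mem_cache_size();
        uint32_t base = std::max<uint32_t>(
            static_cast<uint32_t>(cache_size / kBaseGPUMemCacheSize), 1);
        lws[1] = std::min(gws[1], kwg_size);            // rows first
        lws[0] = std::min(base, kwg_size / lws[1]);     // then columns
        lws[2] = std::min(gws[2], kwg_size / (lws[0] * lws[1]));  // then depth
      }
      return lws;
    }

The result only seeds the search: TuningOrRun2DKernel/TuningOrRun3DKernel pass it to the tuner now reached through runtime->tuner(), which may replace it with tuned parameters.
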
const uint32_t lws_limit = 128; -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + cache_size = runtime->device_global_mem_cache_size(); + uint32_t compute_units = runtime->device_compute_units(); const uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); @@ -62,7 +64,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { } // namespace -extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -92,13 +95,13 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { MACE_CHECK(input_batch == batch); std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1"); built_options.emplace("-Dconv_2d_1x1=" + kernel_name); @@ -160,11 +163,11 @@ extern MaceStatus Conv2dOpenclK1x1(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - std::vector lws = LocalWS(gws, *kwg_size); + std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 02df4ea1..c0362831 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -24,15 +24,17 @@ namespace kernels { namespace { // (inputs + weights + outputs) * array_size * sizeof(float) const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t compute_units = std::max( - OpenCLRuntime::Global()->device_compute_units() / 2, 1); + runtime->device_compute_units() / 2, 1); const uint32_t base = std::max( std::min(cache_size / kBaseGPUMemCacheSize, 4), 1); @@ -55,7 +57,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { } // namespace -extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, +extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -80,11 +83,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, const index_t input_channel_blocks = RoundUpDiv4(input_channels); const 
index_t width_blocks = RoundUpDiv(width); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); @@ -147,11 +150,11 @@ extern MaceStatus Conv2dOpenclK3x3(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - std::vector lws = LocalWS(gws, *kwg_size); + std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index fa2c9774..bac1da8f 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -26,7 +26,8 @@ namespace { const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; // TODO(liuqi): Fix the specific value. const uint32_t lws_limit = 20; -std::vector LocalWS(const uint32_t *gws, +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kernel_size, const uint32_t kwg_size) { std::vector lws(4, 0); @@ -34,8 +35,8 @@ std::vector LocalWS(const uint32_t *gws, lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + cache_size = runtime->device_global_mem_cache_size(); + uint32_t compute_units = runtime->device_compute_units(); const uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); @@ -64,7 +65,8 @@ std::vector LocalWS(const uint32_t *gws, } // namespace -extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, +extern MaceStatus Conv2dOpencl(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -89,11 +91,11 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t width_blocks = RoundUpDiv4(width); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); @@ -162,8 +164,8 @@ extern MaceStatus Conv2dOpencl(cl::Kernel *kernel, Concat("conv2d_general_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3), filter->dim(2), filter->dim(3)); std::vector lws = - LocalWS(gws, filter->dim(2) * filter->dim(3), *kwg_size); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc index 651b2ef8..fce91d2b 100644 --- a/mace/kernels/opencl/crop.cc +++ 
b/mace/kernels/opencl/crop.cc @@ -22,13 +22,15 @@ namespace mace { namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = std::min(base, kwg_size / lws[1]); @@ -132,11 +134,11 @@ MaceStatus CropFunctor::operator()( static_cast(output->dim(0) * output->dim(1)) }; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("crop"); built_options.emplace("-Dcrop=" + kernel_name); @@ -167,11 +169,11 @@ MaceStatus CropFunctor::operator()( input_shape_ = input0->shape(); } - const std::vector lws = LocalWS(gws, kwg_size_); + const std::vector lws = LocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("crop_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc index cba8cbce..197b305e 100644 --- a/mace/kernels/opencl/deconv_2d.cc +++ b/mace/kernels/opencl/deconv_2d.cc @@ -20,7 +20,8 @@ namespace kernels { namespace { -MaceStatus Deconv2dOpencl(cl::Kernel *kernel, +MaceStatus Deconv2dOpencl(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -58,11 +59,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, const int align_w = stride_w - 1 - padding_w; const int kernel_size = filter->dim(2) * filter->dim(3); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("deconv_2d"); built_options.emplace("-Ddeconv_2d=" + kernel_name); @@ -133,11 +134,11 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, *kwg_size); + const std::vector lws = Default3DLocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("deconv2d_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); @@ -192,9 +193,10 @@ MaceStatus Deconv2dFunctor::operator()( &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - return Deconv2dOpencl(&kernel_, input, filter, bias, strides_.data(), - paddings.data(), activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future, + return Deconv2dOpencl(context_, 
&kernel_, input, filter, bias, + strides_.data(), paddings.data(), activation_, + relux_max_limit_, DataTypeToEnum::value, + &input_shape_, output, future, &kwg_size_, &kernel_error_); } diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc index 4c1fd3be..f5427af1 100644 --- a/mace/kernels/opencl/depth_to_space.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -72,11 +72,11 @@ MaceStatus DepthToSpaceOpFunctor::operator()( CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; @@ -119,8 +119,8 @@ MaceStatus DepthToSpaceOpFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 3c97a288..1bc910fd 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -24,13 +24,15 @@ namespace kernels { namespace { // (inputs + weights + outputs) * array_size * sizeof(float) const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = cache_size / kBaseGPUMemCacheSize; lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= base) { @@ -58,7 +60,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { } // namespace -static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, +static MaceStatus DepthwiseConv2d(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM const Tensor *bias, @@ -89,11 +92,11 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, static_cast(width_blocks), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { @@ -170,10 +173,10 @@ static MaceStatus DepthwiseConv2d(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = LocalWS(gws, *kwg_size); + const std::vector lws = LocalWS(runtime, gws, *kwg_size); std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(*kernel, tuning_key, + 
MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); @@ -190,14 +193,10 @@ MaceStatus DepthwiseConv2dFunctor::operator()( index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); if (strides_[0] != strides_[1]) { - LOG(WARNING) << "OpenCL depthwise conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides_[0] << "x" << strides_[1] - << " is not implemented yet, using slow version"; - // TODO(heliangliang) The CPU/NEON kernel should map the buffer - return DepthwiseConv2dFunctor( - strides_, padding_type_, paddings_, dilations_, activation_, - relux_max_limit_)(input, filter, bias, output, future); + LOG(FATAL) << "GPU depthwise conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides_[0] << "x" << strides_[1] + << " is not implemented yet."; } // Create a fake conv_2d filter to calculate the paddings and output size @@ -226,6 +225,7 @@ MaceStatus DepthwiseConv2dFunctor::operator()( MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); return DepthwiseConv2d( + context_, &kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_, &kernel_error_); diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc index 9eedf011..201639e3 100644 --- a/mace/kernels/opencl/eltwise.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -75,10 +75,10 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, static_cast(width), static_cast(batch_height_pixels)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); @@ -124,11 +124,11 @@ MaceStatus EltwiseFunctor::operator()(const Tensor *input0, input_shape_ = input0->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); return MACE_SUCCESS; diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc index dc8798a5..2af592c7 100644 --- a/mace/kernels/opencl/fully_connected.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -22,7 +22,8 @@ namespace kernels { namespace { template -MaceStatus FCWXKernel(cl::Kernel *kernel, +MaceStatus FCWXKernel(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *weight, const Tensor *bias, @@ -36,7 +37,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, std::unique_ptr *kernel_error) { MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { const index_t batch = output->dim(0); @@ -44,7 +45,7 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, const index_t output_blocks = RoundUpDiv4(output_size); std::set built_options; - 
OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected_width"); @@ -154,7 +155,8 @@ MaceStatus FCWXKernel(cl::Kernel *kernel, } template -MaceStatus FCWTXKernel(cl::Kernel *kernel, +MaceStatus FCWTXKernel(OpKernelContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *weight, const Tensor *bias, @@ -168,10 +170,10 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, std::unique_ptr *kernel_error) { MACE_CHECK_NOTNULL(gws); MACE_CHECK_NOTNULL(lws); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); if (kernel->get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(*kernel_error); + OUT_OF_RANGE_CONFIG(*kernel_error, context); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected"); @@ -236,7 +238,7 @@ MaceStatus FCWTXKernel(cl::Kernel *kernel, std::string tuning_key = Concat("fc_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(*kernel, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws->data(), *lws, future)); OUT_OF_RANGE_VALIDATION(*kernel_error); @@ -257,7 +259,8 @@ MaceStatus FullyConnectedFunctor::operator()( &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - return FCWXKernel(&kernel_, input, weight, bias, &input_shape_, output, + return FCWXKernel(context_, + &kernel_, input, weight, bias, &input_shape_, output, activation_, &gws_, &lws_, relux_max_limit_, future, &kernel_error_); } diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 6ef80c80..aa3daadb 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -226,14 +226,14 @@ std::string DtToUpCompatibleCLCMDDt(const DataType dt) { } } -std::vector Default3DLocalWS(const uint32_t *gws, +std::vector Default3DLocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { - uint64_t cache_size = - OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint64_t cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[2] = @@ -245,13 +245,12 @@ std::vector Default3DLocalWS(const uint32_t *gws, return lws; } -MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); - auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -366,29 +365,28 @@ MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, } return error; }; - OpenCLProfilingTimer timer(&event); - cl_int err = Tuner::Get()->template TuneOrRun( + OpenCLProfilingTimer timer(runtime, &event); + cl_int err = runtime->tuner()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); MACE_CL_RET_STATUS(err); if (future != nullptr) { - future->wait_fn = [event](CallStats *stats) { + future->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { - 
OpenCLRuntime::Global()->GetCallStats(event, stats); + runtime->GetCallStats(event, stats); } }; } return MaceStatus::MACE_SUCCESS; } -MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); - auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); @@ -475,8 +473,8 @@ MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, } return error; }; - OpenCLProfilingTimer timer(&event); - cl_int err = Tuner::Get()->template TuneOrRun( + OpenCLProfilingTimer timer(runtime, &event); + cl_int err = runtime->tuner()->template TuneOrRun( tuning_key, lws, params_generator, func, &timer); MACE_CL_RET_STATUS(err); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5d4bf410..d9e309bc 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -31,11 +31,11 @@ namespace mace { namespace kernels { -#define OUT_OF_RANGE_CONFIG(kernel_error) \ +#define OUT_OF_RANGE_CONFIG(kernel_error, context) \ if (runtime->IsOutOfRangeCheckEnabled()) { \ built_options.emplace("-DOUT_OF_RANGE_CHECK"); \ (kernel_error) = std::move(std::unique_ptr( \ - new Buffer(GetDeviceAllocator(DeviceType::GPU)))); \ + new Buffer((context)->device()->allocator()))); \ MACE_RETURN_IF_ERROR((kernel_error)->Allocate(1)); \ (kernel_error)->Map(nullptr); \ *((kernel_error)->mutable_data()) = 0; \ @@ -115,14 +115,16 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpCompatibleCLDt(const DataType dt); // Tuning or Run OpenCL kernel with 3D work group size -MaceStatus TuningOrRun3DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future); // Tuning or Run OpenCL kernel with 2D work group size -MaceStatus TuningOrRun2DKernel(const cl::Kernel &kernel, +MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, + const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, @@ -162,7 +164,8 @@ std::string Concat(Args... 
args) { return ss.str(); } -std::vector Default3DLocalWS(const uint32_t *gws, +std::vector Default3DLocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, const uint32_t kwg_size); } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index 955b9ebe..b98e9fb2 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -67,12 +67,12 @@ MaceStatus ImageToBufferFunctor::operator()( break; } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc index ffc185d0..6704c0b4 100644 --- a/mace/kernels/opencl/lstmcell.cc +++ b/mace/kernels/opencl/lstmcell.cc @@ -38,11 +38,11 @@ MaceStatus LSTMCellFunctor::operator()( const index_t width = input->dim(1); const index_t width_blocks = width / 4; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("lstmcell"); @@ -88,7 +88,7 @@ MaceStatus LSTMCellFunctor::operator()( const std::vector lws = {kwg_size_ / 16, 16, 0}; std::string tuning_key = Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 4df9d58d..407b455d 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -53,11 +53,11 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, static_cast(height_blocks * batch), }; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); @@ -84,7 +84,7 @@ MaceStatus MatMulFunctor::operator()(const Tensor *A, const std::vector lws = {kwg_size_ / 64, 64, 0}; std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index d257fea2..03f05ca5 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -16,6 +16,8 @@ #include #include "gtest/gtest.h" +#include "mace/core/op_kernel_context.h" +#include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include 
"mace/core/workspace.h" @@ -25,14 +27,15 @@ namespace mace { namespace kernels { namespace { -bool BufferToImageOpImpl(Tensor *buffer, +bool BufferToImageOpImpl(OpKernelContext *context, + Tensor *buffer, Tensor *image, const std::vector &image_shape) { std::unique_ptr kernel_error; uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context->device()->opencl_runtime(); std::string kernel_name = "in_out_buffer_to_image"; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); @@ -40,7 +43,7 @@ bool BufferToImageOpImpl(Tensor *buffer, std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - OUT_OF_RANGE_CONFIG(kernel_error); + OUT_OF_RANGE_CONFIG(kernel_error, context); NON_UNIFORM_WG_CONFIG; if (buffer->dtype() == image->dtype()) { built_options.emplace("-DDATA_TYPE=" + @@ -127,25 +130,33 @@ TEST(OutOfRangeCheckTest, RandomTest) { index_t width = 7; index_t channels = 11; - std::vector buffer_shape = {batch, height, width, channels}; + GPUContext gpu_context; + std::unique_ptr device(new GPUDevice(gpu_context.opencl_tuner())); + Workspace ws; + OpKernelContext context(&ws, device.get()); + + std::vector buffer_shape = {batch, height, width, channels}; Tensor *buffer = - ws.CreateTensor("Buffer", GetDeviceAllocator(DeviceType::GPU), + ws.CreateTensor("Buffer", device->allocator(), DataTypeToEnum::v()); buffer->Resize(buffer_shape); std::vector image_shape; - Tensor *image = ws.CreateTensor("Image", GetDeviceAllocator(DeviceType::GPU), + Tensor *image = ws.CreateTensor("Image", device->allocator(), DataTypeToEnum::v()); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); image->ResizeImage(buffer->shape(), image_shape); - ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape)); + ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape)); std::vector overflow_image_shape = image_shape; for (size_t i = 0; i < overflow_image_shape.size(); ++i) { overflow_image_shape[i] += 1; } - ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape)); + ASSERT_TRUE(BufferToImageOpImpl(&context, + buffer, + image, + overflow_image_shape)); } } // namespace kernels diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 04e9d69d..a3f4cfaa 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -47,11 +47,11 @@ MaceStatus PadFunctor::operator()(const Tensor *input, const index_t channel_blocks = RoundUpDiv4(channels); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pad"); built_options.emplace("-Dpad=" + kernel_name); @@ -85,10 +85,10 @@ MaceStatus PadFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git 
a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 18eb6e80..c6743750 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -23,13 +23,15 @@ namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[2] = @@ -54,12 +56,12 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet"; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); built_options.emplace("-Dpooling=" + kernel_name); @@ -149,11 +151,11 @@ MaceStatus PoolingFunctor::operator()(const Tensor *input, }; } - const std::vector lws = LocalWS(gws.data(), kwg_size_); + const std::vector lws = LocalWS(runtime, gws.data(), kwg_size_); std::string tuning_key = Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws.data(), lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/reduce_mean.cc b/mace/kernels/opencl/reduce_mean.cc index 075632c5..a6a45f76 100644 --- a/mace/kernels/opencl/reduce_mean.cc +++ b/mace/kernels/opencl/reduce_mean.cc @@ -39,7 +39,7 @@ MaceStatus ReduceMeanFunctor::operator()( const index_t channel_blocks = RoundUpDiv4(channels); const uint32_t image_size = static_cast(in_height * in_width); - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); std::vector gws(3); std::vector lws(3); std::vector output_shape{batch, 1, 1, channels}; @@ -50,7 +50,7 @@ MaceStatus ReduceMeanFunctor::operator()( if (kernel_.get() == nullptr) { const DataType dt = DataTypeToEnum::value; std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("reduce_mean"); built_options.emplace("-Dreduce_mean=" + kernel_name); diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc index f8a33383..6fc26e52 100644 --- a/mace/kernels/opencl/resize_bicubic.cc +++ b/mace/kernels/opencl/resize_bicubic.cc @@ -23,9 +23,11 @@ namespace mace { namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint64_t cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= base) { @@ -65,15 
+67,15 @@ MaceStatus ResizeBicubicFunctor::operator()( static_cast(out_width), static_cast(out_height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { + auto dt = DataTypeToEnum::value; std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bicubic_nocache"); built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); - auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); @@ -115,11 +117,11 @@ MaceStatus ResizeBicubicFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = LocalWS(gws, kwg_size_); + const std::vector lws = LocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index 0b297dd2..23e5db1c 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -23,13 +23,15 @@ namespace mace { namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t *gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= base) { @@ -70,11 +72,11 @@ MaceStatus ResizeBilinearFunctor::operator()( static_cast(out_width), static_cast(out_height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); @@ -118,11 +120,11 @@ MaceStatus ResizeBilinearFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = LocalWS(gws, kwg_size_); + const std::vector lws = LocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index f401b827..e84ec731 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -24,13 +24,15 @@ namespace kernels { namespace { -std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { +std::vector LocalWS(OpenCLRuntime *runtime, + const uint32_t 
*gws, + const uint32_t kwg_size) { std::vector lws(4, 0); if (kwg_size == 0) { lws[0] = lws[1] = lws[2] = 1; } else { uint64_t - cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + cache_size = runtime->device_global_mem_cache_size(); uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); if (gws[0] < base) { @@ -78,11 +80,11 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, static_cast(width), static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); @@ -107,10 +109,10 @@ MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, input_shape_ = logits->shape(); } - std::vector lws = LocalWS(gws, kwg_size_); + std::vector lws = LocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc index c31b2d69..8794dd2a 100644 --- a/mace/kernels/opencl/space_to_batch.cc +++ b/mace/kernels/opencl/space_to_batch.cc @@ -54,12 +54,12 @@ MaceStatus SpaceToBatchFunctor::operator()( chan_blk, static_cast(batch_tensor->dim(2)), static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; @@ -99,11 +99,11 @@ MaceStatus SpaceToBatchFunctor::operator()( space_shape_ = space_tensor->shape(); } - const std::vector lws = Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); - MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/opencl/split.cc b/mace/kernels/opencl/split.cc index 65fd6be5..c445b783 100644 --- a/mace/kernels/opencl/split.cc +++ b/mace/kernels/opencl/split.cc @@ -40,11 +40,11 @@ MaceStatus SplitFunctor::operator()( output_list[i]->ResizeImage(output_shape, image_shape)); } - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("split"); built_options.emplace("-Dsplit=" + kernel_name); @@ -66,7 +66,7 @@ MaceStatus SplitFunctor::operator()( static_cast(input->dim(0) * input->dim(1)), }; - const std::vector lws = 
Default3DLocalWS(gws, kwg_size_); + const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); cl::Event event; CallStats call_stats{INT64_MAX, 0}; for (size_t i = 0; i < outputs_count; ++i) { diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 74d8776f..43210171 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -24,12 +24,12 @@ namespace kernels { template MaceStatus WinogradTransformFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name; std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; if (wino_blk_size_ == 4) { obfuscated_kernel_name = @@ -120,7 +120,7 @@ MaceStatus WinogradTransformFunctor::operator()( output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); @@ -132,7 +132,7 @@ MaceStatus WinogradInverseTransformFunctor::operator()( const std::vector &inputs, Tensor *output_tensor, StatsFuture *future) { - auto runtime = OpenCLRuntime::Global(); + auto runtime = context_->device()->opencl_runtime(); const Tensor *input_tensor = inputs[0]; const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr; @@ -140,7 +140,7 @@ MaceStatus WinogradInverseTransformFunctor::operator()( if (kernel_.get() == nullptr) { std::string obfuscated_kernel_name; std::set built_options; - OUT_OF_RANGE_CONFIG(kernel_error_); + OUT_OF_RANGE_CONFIG(kernel_error_, context_); NON_UNIFORM_WG_CONFIG; if (wino_blk_size_ == 4) { obfuscated_kernel_name = @@ -241,7 +241,7 @@ MaceStatus WinogradInverseTransformFunctor::operator()( Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3), input_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(kernel_, tuning_key, + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, gws, lws, future)); OUT_OF_RANGE_VALIDATION(kernel_error_); diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h index de851bb7..14a4c8d6 100644 --- a/mace/kernels/pad.h +++ b/mace/kernels/pad.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -29,10 +30,13 @@ namespace mace { namespace kernels { -struct PadFunctorBase { - PadFunctorBase(const std::vector &paddings, +struct PadFunctorBase : OpKernel { + PadFunctorBase(OpKernelContext *context, + const std::vector &paddings, const float constant_value) - : paddings_(paddings), constant_value_(constant_value) {} + : OpKernel(context), + paddings_(paddings), + constant_value_(constant_value) {} std::vector paddings_; float constant_value_; @@ -40,9 +44,10 @@ struct PadFunctorBase { template struct PadFunctor : public PadFunctorBase { - PadFunctor(const std::vector &paddings, + PadFunctor(OpKernelContext *context, + const std::vector &paddings, const float constant_value) - : PadFunctorBase(paddings, constant_value) {} + : PadFunctorBase(context, paddings, constant_value) {} MaceStatus operator()(const Tensor 
*input, Tensor *output, @@ -93,9 +98,10 @@ struct PadFunctor : public PadFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct PadFunctor : PadFunctorBase { - PadFunctor(const std::vector &paddings, + PadFunctor(OpKernelContext *context, + const std::vector &paddings, const float constant_value) - : PadFunctorBase(paddings, constant_value) {} + : PadFunctorBase(context, paddings, constant_value) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 94a388be..c6174528 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -23,6 +23,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/kernel.h" #if defined(MACE_ENABLE_NEON) #include @@ -41,14 +42,16 @@ enum PoolingType { namespace kernels { -struct PoolingFunctorBase { - PoolingFunctorBase(const PoolingType pooling_type, +struct PoolingFunctorBase : OpKernel { + PoolingFunctorBase(OpKernelContext *context, + const PoolingType pooling_type, const int *kernels, const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations) - : pooling_type_(pooling_type), + : OpKernel(context), + pooling_type_(pooling_type), kernels_(kernels), strides_(strides), padding_type_(padding_type), @@ -68,14 +71,20 @@ struct PoolingFunctor; template <> struct PoolingFunctor: PoolingFunctorBase { - PoolingFunctor(const PoolingType pooling_type, + PoolingFunctor(OpKernelContext *context, + const PoolingType pooling_type, const int *kernels, const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations) - : PoolingFunctorBase( - pooling_type, kernels, strides, padding_type, paddings, dilations) { + : PoolingFunctorBase(context, + pooling_type, + kernels, + strides, + padding_type, + paddings, + dilations) { } void MaxPooling(const float *input, @@ -231,15 +240,20 @@ struct PoolingFunctor: PoolingFunctorBase { template <> struct PoolingFunctor: PoolingFunctorBase { - PoolingFunctor(const PoolingType pooling_type, + PoolingFunctor(OpKernelContext *context, + const PoolingType pooling_type, const int *kernels, const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations) - : PoolingFunctorBase( - pooling_type, kernels, strides, padding_type, paddings, dilations) { - } + : PoolingFunctorBase(context, + pooling_type, + kernels, + strides, + padding_type, + paddings, + dilations) {} void MaxPooling(const uint8_t *input, const index_t *in_shape, @@ -443,14 +457,20 @@ struct PoolingFunctor: PoolingFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct PoolingFunctor : PoolingFunctorBase { - PoolingFunctor(const PoolingType pooling_type, + PoolingFunctor(OpKernelContext *context, + const PoolingType pooling_type, const int *kernels, const int *strides, const Padding padding_type, const std::vector &paddings, const int *dilations) - : PoolingFunctorBase( - pooling_type, kernels, strides, padding_type, paddings, dilations) { + : PoolingFunctorBase(context, + pooling_type, + kernels, + strides, + padding_type, + paddings, + dilations) { } MaceStatus operator()(const Tensor *input_tensor, Tensor *output_tensor, diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h index 89f79b7f..aa002988 100644 --- a/mace/kernels/proposal.h +++ b/mace/kernels/proposal.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" 
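
[Note: these header hunks (pad.h, pooling.h, and the ones that follow) all apply one mechanical change: the functor base inherits from OpKernel and forwards an OpKernelContext * through its constructor. OpKernel itself lives in mace/kernels/kernel.h, which this patch adds but whose hunk is not shown in this excerpt; the C++ sketch below gives its presumed shape, inferred from how these constructors use it.]

    #include <vector>

    class OpKernelContext;

    // Presumed shape of the OpKernel base in mace/kernels/kernel.h; the real
    // header may differ, but the hunks here rely only on these two members.
    struct OpKernel {
      explicit OpKernel(OpKernelContext *context) : context_(context) {}
      OpKernelContext *context_;
    };

    // The pattern each functor base follows, modeled on PadFunctorBase above
    // (the name *Sketch marks this as illustrative, not the real type):
    struct PadFunctorBaseSketch : OpKernel {
      PadFunctorBaseSketch(OpKernelContext *context,
                           const std::vector<int> &paddings,
                           const float constant_value)
          : OpKernel(context),
            paddings_(paddings),
            constant_value_(constant_value) {}

      std::vector<int> paddings_;
      float constant_value_;
    };

Stateless functors (Quantize, Dequantize, Reshape, Softmax below) get the same treatment with an explicit single-argument constructor, so every kernel carries a context even when it stores no other state.
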
namespace mace { @@ -121,8 +122,9 @@ inline std::vector nms(const float *bboxes_ptr, template -struct ProposalFunctor { - ProposalFunctor(const int min_size, +struct ProposalFunctor : OpKernel { + ProposalFunctor(OpKernelContext *context, + const int min_size, const float nms_thresh, const int pre_nms_top_n, const int post_nms_top_n, @@ -130,6 +132,7 @@ struct ProposalFunctor { const int base_size, const std::vector &scales, const std::vector &ratios) : + OpKernel(context), min_size_(min_size), thresh_(nms_thresh), pre_nms_top_n_(pre_nms_top_n), diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h index 1f1cb8d1..fe52e8d8 100644 --- a/mace/kernels/quantize.h +++ b/mace/kernels/quantize.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { @@ -173,8 +174,8 @@ template struct QuantizeFunctor; template<> -struct QuantizeFunctor { - QuantizeFunctor() {} +struct QuantizeFunctor : OpKernel { + explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const bool non_zero, @@ -212,8 +213,8 @@ template struct DequantizeFunctor; template<> -struct DequantizeFunctor { - DequantizeFunctor() {} +struct DequantizeFunctor : OpKernel { + explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/reduce_mean.h b/mace/kernels/reduce_mean.h index 65dc67d9..71fc2de0 100644 --- a/mace/kernels/reduce_mean.h +++ b/mace/kernels/reduce_mean.h @@ -24,6 +24,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" #endif @@ -31,10 +32,12 @@ namespace mace { namespace kernels { -struct ReduceFunctorBase { - ReduceFunctorBase(const std::vector &axis, +struct ReduceFunctorBase : OpKernel { + ReduceFunctorBase(OpKernelContext *context, + const std::vector &axis, const bool keep_dims) - : keep_dims_(keep_dims), + : OpKernel(context), + keep_dims_(keep_dims), axis_(axis) {} bool keep_dims_; bool reduce_first_axis_; @@ -44,10 +47,11 @@ struct ReduceFunctorBase { }; template -struct ReduceMeanFunctor : ReduceFunctorBase{ - ReduceMeanFunctor(const std::vector &axis, +struct ReduceMeanFunctor : ReduceFunctorBase { + ReduceMeanFunctor(OpKernelContext *context, + const std::vector &axis, const bool keep_dims) - : ReduceFunctorBase(axis, keep_dims) {} + : ReduceFunctorBase(context, axis, keep_dims) {} void Simplify(const Tensor *input) { std::vector bitmap(static_cast(input->dim_size()), false); @@ -220,9 +224,10 @@ struct ReduceMeanFunctor : ReduceFunctorBase{ template struct ReduceMeanFunctor : ReduceFunctorBase { - ReduceMeanFunctor(const std::vector axis, + ReduceMeanFunctor(OpKernelContext *context, + const std::vector axis, const bool keep_dims) - : ReduceFunctorBase(axis, keep_dims) {} + : ReduceFunctorBase(context, axis, keep_dims) {} MaceStatus operator()(const Tensor *input, Tensor *output_tensor, diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h index cfa7bb2e..f0ab1bf5 100644 --- a/mace/kernels/reshape.h +++ b/mace/kernels/reshape.h @@ -19,17 +19,14 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/kernels/kernel.h" namespace mace { namespace kernels { template -struct ReshapeFunctor { - 
ReshapeFunctor() {} +struct ReshapeFunctor : OpKernel { + explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, const std::vector &out_shape, diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index 046c6bb1..bc8c59e4 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/utils/logging.h" #ifdef MACE_ENABLE_OPENCL @@ -136,10 +137,11 @@ inline void ResizeImage(const float *images, } } -struct ResizeBicubicFunctorBase { - ResizeBicubicFunctorBase(const std::vector &size, - bool align_corners) - : align_corners_(align_corners) { +struct ResizeBicubicFunctorBase : OpKernel { + ResizeBicubicFunctorBase(OpKernelContext *context, + const std::vector &size, + bool align_corners) + : OpKernel(context), align_corners_(align_corners) { MACE_CHECK(size.size() == 2); out_height_ = size[0]; out_width_ = size[1]; @@ -157,8 +159,10 @@ struct ResizeBicubicFunctor; template<> struct ResizeBicubicFunctor : ResizeBicubicFunctorBase { - ResizeBicubicFunctor(const std::vector &size, bool align_corners) - : ResizeBicubicFunctorBase(size, align_corners) {} + ResizeBicubicFunctor(OpKernelContext *context, + const std::vector &size, + bool align_corners) + : ResizeBicubicFunctorBase(context, size, align_corners) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -203,8 +207,10 @@ struct ResizeBicubicFunctor template struct ResizeBicubicFunctor : ResizeBicubicFunctorBase { - ResizeBicubicFunctor(const std::vector &size, bool align_corners) - : ResizeBicubicFunctorBase(size, align_corners) {} + ResizeBicubicFunctor(OpKernelContext *context, + const std::vector &size, + bool align_corners) + : ResizeBicubicFunctorBase(context, size, align_corners) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index cb41ef45..92e57b4f 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -21,6 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -113,10 +114,12 @@ inline void ResizeImage(const float *images, } } -struct ResizeBilinearFunctorBase { - ResizeBilinearFunctorBase(const std::vector &size, +struct ResizeBilinearFunctorBase : OpKernel { + ResizeBilinearFunctorBase(OpKernelContext *context, + const std::vector &size, bool align_corners) - : align_corners_(align_corners) { + : OpKernel(context), + align_corners_(align_corners) { MACE_CHECK(size.size() == 2); out_height_ = size[0]; out_width_ = size[1]; @@ -134,8 +137,10 @@ struct ResizeBilinearFunctor; template<> struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { - ResizeBilinearFunctor(const std::vector &size, bool align_corners) - : ResizeBilinearFunctorBase(size, align_corners) {} + ResizeBilinearFunctor(OpKernelContext *context, + const std::vector &size, + bool align_corners) + : ResizeBilinearFunctorBase(context, size, align_corners) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -187,8 +192,10 @@ struct ResizeBilinearFunctor template struct ResizeBilinearFunctor : ResizeBilinearFunctorBase { - ResizeBilinearFunctor(const std::vector &size, bool align_corners) - : ResizeBilinearFunctorBase(size, align_corners) {} + ResizeBilinearFunctor(OpKernelContext 
*context, + const std::vector &size, + bool align_corners) + : ResizeBilinearFunctorBase(context, size, align_corners) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/scalar_math.h b/mace/kernels/scalar_math.h index 60430207..928a4954 100644 --- a/mace/kernels/scalar_math.h +++ b/mace/kernels/scalar_math.h @@ -89,12 +89,14 @@ void ScalarEltwise(const T* in0, template -struct ScalarMathFunctor { - explicit ScalarMathFunctor(const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index) - : type_(type), +struct ScalarMathFunctor : OpKernel { + ScalarMathFunctor(OpKernelContext *context, + const EltwiseType type, + const std::vector &coeff, + const float scalar_input, + const int32_t scalar_input_index) + : OpKernel(context), + type_(type), coeff_(coeff), scalar_input_(scalar_input), scalar_input_index_(scalar_input_index) {} diff --git a/mace/kernels/sgemm.h b/mace/kernels/sgemm.h index 15cec1dd..3aaf5d47 100644 --- a/mace/kernels/sgemm.h +++ b/mace/kernels/sgemm.h @@ -89,7 +89,7 @@ typedef Major PackOrder; template class PackedBlock { public: - PackedBlock() : data_tensor_(GetDeviceAllocator(CPU), + PackedBlock() : data_tensor_(GetCPUAllocator(), DataTypeToEnum::v()) {} const T *data() { diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index 5de3ade1..0c2c9126 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -27,6 +27,7 @@ #include "mace/utils/utils.h" #include "mace/kernels/fixpoint.h" #include "mace/kernels/gemmlowp_util.h" +#include "mace/kernels/kernel.h" #include "mace/kernels/quantize.h" #ifdef MACE_ENABLE_OPENCL @@ -40,7 +41,8 @@ template struct SoftmaxFunctor; template<> -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -127,7 +129,8 @@ static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; template<> -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *input, Tensor *output, StatsFuture *future) { @@ -354,7 +357,8 @@ struct SoftmaxFunctor { #ifdef MACE_ENABLE_OPENCL template -struct SoftmaxFunctor { +struct SoftmaxFunctor : OpKernel { + explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} MaceStatus operator()(const Tensor *logits, Tensor *output, StatsFuture *future); diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 786e270a..7670632a 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -21,7 +21,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/kernels/kernel.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/cl2_header.h" @@ -30,11 +30,13 @@ namespace mace { namespace kernels { -struct SpaceToBatchFunctorBase { - SpaceToBatchFunctorBase(const std::vector &paddings, +struct SpaceToBatchFunctorBase : OpKernel { + SpaceToBatchFunctorBase(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : paddings_(paddings.begin(), paddings.end()), + : OpKernel(context), + paddings_(paddings.begin(), paddings.end()), block_shape_(block_shape.begin(), block_shape.end()), b2s_(b2s) { MACE_CHECK( @@ -135,10 +137,11 @@ struct SpaceToBatchFunctor; template<> struct 
SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(const std::vector &paddings, + SpaceToBatchFunctor(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} + : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, @@ -319,10 +322,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(const std::vector &paddings, + SpaceToBatchFunctor(OpKernelContext *context, + const std::vector &paddings, const std::vector &block_shape, bool b2s) - : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {} + : SpaceToBatchFunctorBase(context, paddings, block_shape, b2s) {} MaceStatus operator()(Tensor *space_tensor, Tensor *batch_tensor, diff --git a/mace/kernels/split.h b/mace/kernels/split.h index 95ff7861..899e74da 100644 --- a/mace/kernels/split.h +++ b/mace/kernels/split.h @@ -22,6 +22,7 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" #include "mace/core/types.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -31,15 +32,17 @@ namespace mace { namespace kernels { -struct SplitFunctorBase { - explicit SplitFunctorBase(const int32_t axis) : axis_(axis) {} +struct SplitFunctorBase : OpKernel { + SplitFunctorBase(OpKernelContext *context, const int32_t axis) + : OpKernel(context), axis_(axis) {} int32_t axis_; }; template struct SplitFunctor : SplitFunctorBase { - explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} + SplitFunctor(OpKernelContext *context, const int32_t axis) + : SplitFunctorBase(context, axis) {} MaceStatus operator()(const Tensor *input, const std::vector &output_list, @@ -90,11 +93,12 @@ struct SplitFunctor : SplitFunctorBase { #ifdef MACE_ENABLE_OPENCL template struct SplitFunctor : SplitFunctorBase { - explicit SplitFunctor(const int32_t axis) : SplitFunctorBase(axis) {} + SplitFunctor(OpKernelContext *context, const int32_t axis) + : SplitFunctorBase(context, axis) {} MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future); + const std::vector &output_list, + StatsFuture *future); cl::Kernel kernel_; uint32_t kwg_size_; std::unique_ptr kernel_error_; diff --git a/mace/kernels/stack.h b/mace/kernels/stack.h index 9a84bed0..4d465784 100644 --- a/mace/kernels/stack.h +++ b/mace/kernels/stack.h @@ -22,14 +22,16 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct StackFunctor { - explicit StackFunctor(int axis) : axis_(axis) {} +struct StackFunctor : OpKernel { + StackFunctor(OpKernelContext *context, int axis) + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const std::vector &inputs, Tensor *output, diff --git a/mace/kernels/strided_slice.h b/mace/kernels/strided_slice.h index a6afb46c..a5d0eb38 100644 --- a/mace/kernels/strided_slice.h +++ b/mace/kernels/strided_slice.h @@ -21,26 +21,29 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct StridedSliceFunctor { - StridedSliceFunctor(int begin_mask, +struct StridedSliceFunctor : OpKernel { + StridedSliceFunctor(OpKernelContext *context, + int 
begin_mask, int end_mask, int ellipsis_mask, int new_axis_mask, int shrink_axis_mask, bool is_slice) - : begin_mask_(begin_mask), + : OpKernel(context), + begin_mask_(begin_mask), end_mask_(end_mask), ellipsis_mask_(ellipsis_mask), new_axis_mask_(new_axis_mask), shrink_axis_mask_(shrink_axis_mask), is_slice_(is_slice), - tmp_strides_tensor_(GetDeviceAllocator(D), + tmp_strides_tensor_(context->device()->allocator(), DataTypeToEnum::v()) {} MaceStatus operator()(const Tensor *input, diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h index 8de796aa..87f9c0e2 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.h @@ -105,8 +105,9 @@ static void TransposeNCHWToNHWCC2(const float *input, } template -struct TransposeFunctor { - explicit TransposeFunctor(const std::vector &dims) : dims_(dims) {} +struct TransposeFunctor : OpKernel { + TransposeFunctor(OpKernelContext *context, const std::vector &dims) + : OpKernel(context), dims_(dims) {} MaceStatus operator()(const Tensor *input, Tensor *output, diff --git a/mace/kernels/unstack.h b/mace/kernels/unstack.h index 82b5c467..b193c6b5 100644 --- a/mace/kernels/unstack.h +++ b/mace/kernels/unstack.h @@ -22,14 +22,16 @@ #include "mace/core/future.h" #include "mace/core/tensor.h" +#include "mace/kernels/kernel.h" #include "mace/public/mace.h" namespace mace { namespace kernels { template -struct UnstackFunctor { - explicit UnstackFunctor(int axis) : axis_(axis) {} +struct UnstackFunctor : OpKernel { + UnstackFunctor(OpKernelContext *context, int axis) + : OpKernel(context), axis_(axis) {} MaceStatus operator()(const Tensor *input, const std::vector &outputs, diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index c7d6fc1a..c2e267c4 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -30,11 +30,13 @@ namespace mace { namespace kernels { -struct WinogradTransformFunctorBase { - WinogradTransformFunctorBase(const Padding &padding_type, +struct WinogradTransformFunctorBase : OpKernel { + WinogradTransformFunctorBase(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : strides_({1, 1}), + : OpKernel(context), + strides_({1, 1}), dilations_({1, 1}), padding_type_(padding_type), paddings_(paddings), @@ -49,10 +51,14 @@ struct WinogradTransformFunctorBase { template struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(const Padding &padding_type, + WinogradTransformFunctor(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} + : WinogradTransformFunctorBase(context, + padding_type, + paddings, + block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -69,10 +75,14 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase { template struct WinogradTransformFunctor : WinogradTransformFunctorBase { - WinogradTransformFunctor(const Padding &padding_type, + WinogradTransformFunctor(OpKernelContext *context, + const Padding &padding_type, const std::vector &paddings, const int block_size) - : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} + : WinogradTransformFunctorBase(context, + padding_type, + paddings, + block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -85,11 +95,13 @@ struct WinogradTransformFunctor }; #endif // MACE_ENABLE_OPENCL -struct 
WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctorBase(const ActivationType activation,
+struct WinogradInverseTransformFunctorBase : OpKernel {
+  WinogradInverseTransformFunctorBase(OpKernelContext *context,
+                                      const ActivationType activation,
                                       const float relux_max_limit,
                                       const int block_size)
-      : wino_blk_size_(block_size),
+      : OpKernel(context),
+        wino_blk_size_(block_size),
         activation_(activation),
         relux_max_limit_(relux_max_limit) {}
 
@@ -100,11 +112,12 @@ struct WinogradInverseTransformFunctorBase {
 template <DeviceType D, typename T>
 struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctor(const ActivationType activation,
+  WinogradInverseTransformFunctor(OpKernelContext *context,
+                                  const ActivationType activation,
                                   const float relux_max_limit,
                                   const int block_size)
       : WinogradInverseTransformFunctorBase(
-          activation, relux_max_limit, block_size) {}
+          context, activation, relux_max_limit, block_size) {}
 
   MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                         Tensor *output,
@@ -121,11 +134,12 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
 template <typename T>
 struct WinogradInverseTransformFunctor<DeviceType::GPU, T>
     : WinogradInverseTransformFunctorBase {
-  WinogradInverseTransformFunctor(const ActivationType activation,
+  WinogradInverseTransformFunctor(OpKernelContext *context,
+                                  const ActivationType activation,
                                   const float relux_max_limit,
                                   const int block_size)
       : WinogradInverseTransformFunctorBase(
-          activation, relux_max_limit, block_size) {}
+          context, activation, relux_max_limit, block_size) {}
 
   MaceStatus operator()(const std::vector<const Tensor *> &inputs,
                         Tensor *output,
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 65d7d03c..80a35943 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -21,10 +21,12 @@
 #include <memory>
 
 #include "mace/core/net.h"
+#include "mace/core/device_context.h"
 #include "mace/ops/ops_register.h"
 #include "mace/public/mace.h"
 
 #ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/gpu_device.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #endif  // MACE_ENABLE_OPENCL
 
@@ -63,9 +65,9 @@ void UnloadModelData(const unsigned char *model_data,
 }
 
 #ifdef MACE_ENABLE_OPENCL
-MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
+MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
   // Check OpenCL avaliable
-  auto runtime = OpenCLRuntime::Global();
+  auto runtime = device->opencl_runtime();
   if (!runtime->is_opencl_avaliable()) {
     return MaceStatus::MACE_OUT_OF_RESOURCES;
   }
@@ -101,6 +103,199 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def) {
 
 }  // namespace
 
+class GPUContextBuilder::Impl {
+ public:
+  void SetStoragePath(const std::string &path);
+
+  void SetOpenCLBinaryPaths(const std::vector<std::string> &paths);
+
+  void SetOpenCLParameterPath(const std::string &path);
+
+  std::shared_ptr<GPUContext> Finalize();
+
+ public:
+  std::string storage_path_;
+  std::vector<std::string> opencl_binary_paths_;
+  std::string opencl_parameter_path_;
+};
+
+void GPUContextBuilder::Impl::SetStoragePath(const std::string &path) {
+  storage_path_ = path;
+}
+
+void GPUContextBuilder::Impl::SetOpenCLBinaryPaths(
+    const std::vector<std::string> &paths) {
+  opencl_binary_paths_ = paths;
+}
+
+void GPUContextBuilder::Impl::SetOpenCLParameterPath(
+    const std::string &path) {
+  opencl_parameter_path_ = path;
+}
+
+std::shared_ptr<GPUContext> GPUContextBuilder::Impl::Finalize() {
+  return std::shared_ptr<GPUContext>(new GPUContext(storage_path_,
+                                                    opencl_binary_paths_,
+                                                    opencl_parameter_path_));
+}
+
+GPUContextBuilder::GPUContextBuilder() : impl_(new GPUContextBuilder::Impl) {}
+
+GPUContextBuilder::~GPUContextBuilder() = default;
+
+GPUContextBuilder &GPUContextBuilder::SetStoragePath(const std::string &path) {
+  impl_->SetStoragePath(path);
+  return *this;
+}
+
+GPUContextBuilder &GPUContextBuilder::SetOpenCLBinaryPaths(
+    const std::vector<std::string> &paths) {
+  impl_->SetOpenCLBinaryPaths(paths);
+  return *this;
+}
+
+GPUContextBuilder &GPUContextBuilder::SetOpenCLParameterPath(
+    const std::string &path) {
+  impl_->SetOpenCLParameterPath(path);
+  return *this;
+}
+
+std::shared_ptr<GPUContext> GPUContextBuilder::Finalize() {
+  return impl_->Finalize();
+}
+
+class MaceEngineConfig::Impl {
+ public:
+  explicit Impl(const DeviceType device_type);
+  ~Impl() = default;
+
+  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
+
+  MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
+
+  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
+                                CPUAffinityPolicy policy,
+                                bool use_gemmlowp);
+
+  MaceStatus SetOpenMPThreadAffinity(int num_threads,
+                                     const std::vector<int> &cpu_ids);
+
+  inline DeviceType device_type() const {
+    return device_type_;
+  }
+
+  inline int num_threads() const {
+    return num_threads_;
+  }
+
+  inline std::shared_ptr<GPUContext> gpu_context() const {
+    return gpu_context_;
+  }
+
+  inline GPUPriorityHint gpu_priority_hint() const {
+    return gpu_priority_hint_;
+  }
+
+  inline GPUPerfHint gpu_perf_hint() const {
+    return gpu_perf_hint_;
+  }
+
+ private:
+  DeviceType device_type_;
+  int num_threads_;
+  std::shared_ptr<GPUContext> gpu_context_;
+  GPUPriorityHint gpu_priority_hint_;
+  GPUPerfHint gpu_perf_hint_;
+};
+
+MaceEngineConfig::Impl::Impl(const DeviceType device_type)
+    : device_type_(device_type),
+      num_threads_(-1),
+      gpu_context_(new GPUContext),
+      gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
+      gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
+
+MaceStatus MaceEngineConfig::Impl::SetGPUContext(
+    std::shared_ptr<GPUContext> context) {
+  gpu_context_ = context;
+  return MACE_SUCCESS;
+}
+
+MaceStatus MaceEngineConfig::Impl::SetGPUHints(
+    GPUPerfHint perf_hint,
+    GPUPriorityHint priority_hint) {
+  gpu_perf_hint_ = perf_hint;
+  gpu_priority_hint_ = priority_hint;
+  return MACE_SUCCESS;
+}
+
+MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
+    int num_threads,
+    CPUAffinityPolicy policy,
+    bool use_gemmlowp) {
+  num_threads_ = num_threads;
+  return mace::SetOpenMPThreadsAndAffinityPolicy(
+      num_threads, policy, use_gemmlowp);
+}
+
+MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
+    int num_threads,
+    const std::vector<int> &cpu_ids) {
+  num_threads_ = num_threads;
+  return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
+}
+
+
+MaceEngineConfig::MaceEngineConfig(
+    const DeviceType device_type)
+    : impl_(new MaceEngineConfig::Impl(device_type)) {}
+
+MaceEngineConfig::~MaceEngineConfig() = default;
+
+MaceStatus MaceEngineConfig::SetGPUContext(
+    std::shared_ptr<GPUContext> context) {
+  return impl_->SetGPUContext(context);
+}
+
+MaceStatus MaceEngineConfig::SetGPUHints(
+    GPUPerfHint perf_hint,
+    GPUPriorityHint priority_hint) {
+  return impl_->SetGPUHints(perf_hint, priority_hint);
+}
+
+MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
+    int num_threads_hint,
+    CPUAffinityPolicy policy,
+    bool use_gemmlowp) {
+  return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
+}
+
+MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
+    int num_threads,
+    const std::vector<int> &cpu_ids) {
+  return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
+}
+
+DeviceType MaceEngineConfig::device_type() const {
+  return impl_->device_type();
+}
+
+int MaceEngineConfig::num_threads() const {
+  return impl_->num_threads();
+}
+
+std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
+  return impl_->gpu_context();
+}
+
+GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
+  return impl_->gpu_perf_hint();
+}
+
+GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
+  return impl_->gpu_priority_hint();
+}
+
 // Mace Tensor
 class MaceTensor::Impl {
  public:
@@ -155,7 +350,7 @@ std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
 
 // Mace Engine
 class MaceEngine::Impl {
  public:
-  explicit Impl(DeviceType device_type);
+  explicit Impl(const MaceEngineConfig &config);
 
   ~Impl();
 
@@ -178,6 +373,7 @@ class MaceEngine::Impl {
   size_t model_data_size_;
   std::shared_ptr<OperatorRegistry> op_registry_;
   DeviceType device_type_;
+  std::unique_ptr<Device> device_;
   std::unique_ptr<Workspace> ws_;
   std::unique_ptr<NetBase> net_;
   std::map<std::string, mace::InputInfo> input_info_map_;
@@ -189,11 +385,12 @@ class MaceEngine::Impl {
   MACE_DISABLE_COPY_AND_ASSIGN(Impl);
 };
 
-MaceEngine::Impl::Impl(DeviceType device_type)
+MaceEngine::Impl::Impl(const MaceEngineConfig &config)
     : model_data_(nullptr),
       model_data_size_(0),
      op_registry_(new OperatorRegistry()),
-      device_type_(device_type),
+      device_type_(config.device_type()),
+      device_(nullptr),
       ws_(new Workspace()),
       net_(nullptr)
 #ifdef MACE_ENABLE_HEXAGON
@@ -201,6 +398,19 @@ MaceEngine::Impl::Impl(DeviceType device_type)
 #endif
 {
   LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
+  if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
+    device_.reset(new CPUDevice(config.num_threads()));
+  }
+#ifdef MACE_ENABLE_OPENCL
+  if (device_type_ == DeviceType::GPU) {
+    device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
+                                config.gpu_context()->opencl_cache_storage(),
+                                config.gpu_priority_hint(),
+                                config.gpu_perf_hint(),
+                                config.gpu_context()->opencl_binary_storage(),
+                                config.num_threads()));
+  }
+#endif
 }
 
 MaceStatus MaceEngine::Impl::Init(
@@ -212,7 +422,7 @@ MaceStatus MaceEngine::Impl::Init(
   // Check avalibility
 #ifdef MACE_ENABLE_OPENCL
   if (device_type_ == DeviceType::GPU) {
-    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def));
+    MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
   }
 #endif
   // Get input and output information.
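Together, GPUContextBuilder and MaceEngineConfig replace the process-wide setters that this patch deletes from mace_runtime.cc further down. A hypothetical end-to-end configuration under the new API (storage_path, opencl_binary_file, and opencl_parameter_file stand for caller-supplied values; AFFINITY_BIG_ONLY is assumed to be one of the pre-existing CPUAffinityPolicy values; error handling elided):

    // Build a GPUContext once; it can be shared across several engines.
    std::shared_ptr<mace::GPUContext> gpu_context =
        mace::GPUContextBuilder()
            .SetStoragePath(storage_path)
            .SetOpenCLBinaryPaths({opencl_binary_file})
            .SetOpenCLParameterPath(opencl_parameter_file)
            .Finalize();

    // Per-engine configuration object; no global static state involved.
    mace::MaceEngineConfig config(mace::DeviceType::GPU);
    config.SetGPUContext(gpu_context);
    config.SetGPUHints(mace::GPUPerfHint::PERF_NORMAL,
                       mace::GPUPriorityHint::PRIORITY_LOW);
    config.SetCPUThreadPolicy(4,  // num_threads_hint
                              mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY,
                              false);  // use_gemmlowp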
@@ -230,7 +440,7 @@ MaceStatus MaceEngine::Impl::Init(
                  << MakeString(MapKeys(input_info_map_));
     }
     ws_->CreateTensor(MakeString("mace_input_node_", input_name),
-                      GetDeviceAllocator(device_type_), DT_FLOAT);
+                      device_->allocator(), DT_FLOAT);
   }
   for (auto output_name : output_nodes) {
     if (output_info_map_.find(output_name) == output_info_map_.end()) {
@@ -239,7 +449,7 @@ MaceStatus MaceEngine::Impl::Init(
                  << MakeString(MapKeys(output_info_map_));
     }
     ws_->CreateTensor(MakeString("mace_output_node_", output_name),
-                      GetDeviceAllocator(device_type_), DT_FLOAT);
+                      device_->allocator(), DT_FLOAT);
   }
 #ifdef MACE_ENABLE_HEXAGON
   if (device_type_ == HEXAGON) {
@@ -255,19 +465,20 @@ MaceStatus MaceEngine::Impl::Init(
     }
   } else {
 #endif
-    MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
-        *net_def, device_type_, model_data));
+    MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
+                                              device_.get(),
+                                              model_data));
 
     // Init model
-    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
+    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(),
                          NetMode::INIT);
     MACE_RETURN_IF_ERROR(net->Run());
-    net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_type_);
+    net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get());
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
 
   if (device_type_ == DeviceType::GPU) {
-    ws_->RemoveAndReloadBuffer(*net_def, model_data);
+    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
   }
   return MaceStatus::MACE_SUCCESS;
 }
@@ -360,7 +571,7 @@ MaceStatus MaceEngine::Impl::Run(
 
 #ifdef MACE_ENABLE_OPENCL
   if (device_type_ == GPU) {
-    OpenCLRuntime::Global()->SaveBuiltCLProgram();
+    device_->opencl_runtime()->SaveBuiltCLProgram();
   }
 #endif
   for (auto &output : *outputs) {
@@ -385,8 +596,8 @@ MaceStatus MaceEngine::Impl::Run(
   return MACE_SUCCESS;
 }
 
-MaceEngine::MaceEngine(DeviceType device_type):
-    impl_(new MaceEngine::Impl(device_type)) {}
+MaceEngine::MaceEngine(const MaceEngineConfig &config):
+    impl_(new MaceEngine::Impl(config)) {}
 
 MaceEngine::~MaceEngine() = default;
 
@@ -421,7 +632,7 @@ MaceStatus CreateMaceEngineFromProto(
     const std::string &model_data_file,
     const std::vector<std::string> &input_nodes,
     const std::vector<std::string> &output_nodes,
-    const DeviceType device_type,
+    const MaceEngineConfig &config,
     std::shared_ptr<MaceEngine> *engine) {
   LOG(INFO) << "Create MaceEngine from model pb";
   // load model
@@ -432,7 +643,7 @@ MaceStatus CreateMaceEngineFromProto(
 
   std::shared_ptr<NetDef> net_def(new NetDef());
   net_def->ParseFromArray(&model_pb[0], model_pb.size());
-  engine->reset(new mace::MaceEngine(device_type));
+  engine->reset(new mace::MaceEngine(config));
   MaceStatus status = (*engine)->Init(
       net_def.get(), input_nodes, output_nodes, model_data_file);
 
diff --git a/mace/libmace/mace_runtime.cc b/mace/libmace/mace_runtime.cc
deleted file mode 100644
index 24b2cd8f..00000000
--- a/mace/libmace/mace_runtime.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright 2018 Xiaomi, Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
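The file deleted here is where the old global configuration entry points lived; each one has a direct counterpart on the new objects, so migration is mechanical. A hypothetical before/after for engine creation (model_pb, model_data_file, input_names, and output_names are caller-supplied; config is the MaceEngineConfig built above):

    // Before: global setters plus a bare DeviceType argument.
    //   mace::SetGPUHints(perf_hint, priority_hint);
    //   mace::SetOpenMPThreadPolicy(4, policy, false);
    //   mace::CreateMaceEngineFromProto(model_pb, model_data_file, input_names,
    //                                   output_names, DeviceType::GPU, &engine);

    // After: every setting travels inside the config object.
    std::shared_ptr<mace::MaceEngine> engine;
    mace::MaceStatus status = mace::CreateMaceEngineFromProto(
        model_pb, model_data_file, input_names, output_names, config, &engine);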
- -#include "mace/core/macros.h" -#include "mace/core/file_storage.h" -#include "mace/core/runtime/cpu/cpu_runtime.h" -#include "mace/public/mace_runtime.h" -#include "mace/utils/logging.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_runtime.h" -#endif // MACE_ENABLE_OPENCL - -namespace mace { - -class FileStorageFactory::Impl { - public: - explicit Impl(const std::string &path); - - std::unique_ptr CreateStorage(const std::string &name); - - private: - std::string path_; -}; - -FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {} - -std::unique_ptr FileStorageFactory::Impl::CreateStorage( - const std::string &name) { - return std::move(std::unique_ptr( - new FileStorage(path_ + "/" + name))); -} - -FileStorageFactory::FileStorageFactory(const std::string &path): - impl_(new FileStorageFactory::Impl(path)) {} - -FileStorageFactory::~FileStorageFactory() = default; - -std::unique_ptr FileStorageFactory::CreateStorage( - const std::string &name) { - return impl_->CreateStorage(name); -} - -extern std::shared_ptr kStorageFactory; - -void SetKVStorageFactory(std::shared_ptr storage_factory) { - VLOG(1) << "Set internal KV Storage Engine"; - kStorageFactory = storage_factory; -} - -// Set OpenCL Compiled Binary paths, just call once. (Not thread-safe) -void SetOpenCLBinaryPaths(const std::vector &paths) { -#ifdef MACE_ENABLE_OPENCL - OpenCLRuntime::ConfigureOpenCLBinaryPath(paths); -#else - MACE_UNUSED(paths); -#endif // MACE_ENABLE_OPENCL -} - -extern std::string kOpenCLParameterPath; - -void SetOpenCLParameterPath(const std::string &path) { -#ifdef MACE_ENABLE_OPENCL - kOpenCLParameterPath = path; -#else - MACE_UNUSED(path); -#endif // MACE_ENABLE_OPENCL -} - -void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { -#ifdef MACE_ENABLE_OPENCL - VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint - << ", gpu_priority_hint: " << gpu_priority_hint; - OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint); -#else - MACE_UNUSED(gpu_perf_hint); - MACE_UNUSED(gpu_priority_hint); -#endif // MACE_ENABLE_OPENCL -} - -MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp) { - VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint - << ", affinity policy: " << policy; - return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, - policy, - use_gemmlowp); -} - -MaceStatus SetOpenMPThreadAffinity(int num_threads, - const std::vector &cpu_ids) { - return SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids); -} - -MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, - std::vector *little_core_ids) { - return GetCPUBigLittleCoreIDs(big_core_ids, little_core_ids); -} - - -}; // namespace mace diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds index 76d8f1c2..4bdc33db 100644 --- a/mace/libmace/mace_version_script.lds +++ b/mace/libmace/mace_version_script.lds @@ -1,15 +1,10 @@ mace { global: + *GPUContextBuilder*; + *MaceEngineConfig*; *MaceTensor*; *MaceEngine*; *CreateMaceEngineFromProto*; - *FileStorageFactory*; - *SetKVStorageFactory*; - *SetOpenCLBinaryPaths*; - *SetOpenCLParameterPath*; - *SetGPUHints*; - *SetOpenMPThreadPolicy*; - *SetOpenMPThreadAffinity*; *GetBigLittleCoreIDs*; *MaceVersion*; diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 07aad1d2..312bdc90 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -23,8 +23,25 @@ cc_library( hdrs = [ "ops_test_util.h", ], + srcs = [ + "ops_test_util.cc", + ], + 
copts = [ + "-Werror", + "-Wextra", + ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ + "-DMACE_ENABLE_NEON", + ]) + if_android_armv7([ + "-mfpu=neon", + ]) + if_android_armv7([ + "-mfloat-abi=softfp", + ]) + if_opencl_enabled([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), deps = [ - "//mace/core", + "//mace/ops", "@gtest", ], ) @@ -36,6 +53,7 @@ cc_library( exclude = [ "*_test.cc", "*_benchmark.cc", + "ops_test_util.cc", "buffer_to_image.cc", "image_to_buffer.cc", "lstmcell.cc", diff --git a/mace/ops/activation.h b/mace/ops/activation.h index 8938ea74..3b48891e 100644 --- a/mace/ops/activation.h +++ b/mace/ops/activation.h @@ -26,9 +26,10 @@ namespace ops { template class ActivationOp : public Operator { public: - ActivationOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(kernels::StringToActivationType( + ActivationOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), static_cast( diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index cc40ac9d..49422f3a 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -58,7 +58,7 @@ void TestSimpleRelu() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -106,7 +106,7 @@ void TestUnalignedSimpleRelu() { net.RunOp(D); } - auto expected = CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); + auto expected = net.CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -159,7 +159,7 @@ void TestSimpleRelux() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -209,7 +209,7 @@ void TestSimpleReluRelux() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -267,7 +267,7 @@ void TestSimplePrelu() { } if (D == DeviceType::CPU) { - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -318,7 +318,7 @@ void TestSimpleTanh() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {-0.99999834, 0.99999834, -0.99998771, 0.99998771, -0.9999092, 0.9999092, -0.9993293, 0.9993293, -0.99505475, 0.99505475, -0.96402758, 0.96402758, @@ -371,7 +371,7 @@ void TestSimpleSigmoid() { net.RunOp(D); } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {2, 2, 2, 2}, {9.11051194e-04, 9.99088949e-01, 2.47262316e-03, 9.97527377e-01, 6.69285092e-03, 9.93307149e-01, 1.79862100e-02, 9.82013790e-01, diff --git a/mace/ops/addn.h b/mace/ops/addn.h index 64373343..4238a013 100644 --- a/mace/ops/addn.h +++ b/mace/ops/addn.h @@ -26,8 +26,8 @@ namespace ops { template class AddNOp : public Operator { public: - AddNOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + AddNOp(const OperatorDef &operator_def, OpKernelContext 
*context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { Tensor *output_tensor = this->Output(0); diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 2f5aa28a..7154ad52 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -39,7 +39,7 @@ void SimpleAdd2() { // Run net.RunOp(D); - auto expected = CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); + auto expected = net.CreateTensor({1, 2, 3, 1}, {2, 4, 6, 8, 10, 12}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -98,7 +98,7 @@ void SimpleAdd3() { } auto expected = - CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); + net.CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-4, 1e-3); } @@ -136,8 +136,8 @@ void RandomTest() { // run on cpu net.RunOp(); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu for (int i = 0; i < input_num; ++i) { @@ -160,7 +160,7 @@ void RandomTest() { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); } } diff --git a/mace/ops/argmax.h b/mace/ops/argmax.h index ce493059..b1d7ec4e 100644 --- a/mace/ops/argmax.h +++ b/mace/ops/argmax.h @@ -26,8 +26,8 @@ namespace ops { template class ArgMaxOp : public Operator { public: - ArgMaxOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/argmax_test.cc b/mace/ops/argmax_test.cc index bf00b579..ca7ece35 100644 --- a/mace/ops/argmax_test.cc +++ b/mace/ops/argmax_test.cc @@ -47,7 +47,7 @@ void ArgMaxTest(const std::vector &input_shape, } // Check - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 9d983f10..7221c3ca 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -25,9 +25,9 @@ namespace ops { template class BatchNormOp : public Operator { public: - BatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(false, kernels::ActivationType::NOOP, 0.0f) { + BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, false, kernels::ActivationType::NOOP, 0.0f) { epsilon_ = OperatorBase::GetOptionalArg("epsilon", static_cast(1e-4)); } @@ -52,7 +52,8 @@ class BatchNormOp : public Operator { Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - return functor_(input, scale, offset, mean, var, epsilon_, output, future); + return functor_(input, scale, offset, + mean, var, epsilon_, output, future); } private: diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index b72ec73a..7d5b77da 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -79,7 +79,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 
1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); @@ -130,8 +130,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -166,7 +166,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { @@ -208,8 +209,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -245,7 +246,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-1, 1e-2); } TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { @@ -287,8 +289,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -323,7 +325,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { @@ -365,8 +368,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -402,7 +405,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-1, 1e-2); } } // namespace test diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h index 91c4a9ba..fa1ed2c6 100644 --- a/mace/ops/batch_to_space.h +++ b/mace/ops/batch_to_space.h @@ -27,9 +27,10 @@ namespace ops { template class BatchToSpaceNDOp : public Operator { public: - BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), + BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), true) {} diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h index 901c1e74..ee3de991 100644 --- a/mace/ops/bias_add.h +++ b/mace/ops/bias_add.h @@ -24,10 +24,11 @@ namespace ops { template class BiasAddOp : public Operator { public: - 
BiasAddOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(static_cast(OperatorBase::GetOptionalArg( - "data_format", NHWC))) {} + BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + static_cast(OperatorBase::GetOptionalArg( + "data_format", NHWC))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index c4158454..51c8cc88 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -66,7 +66,7 @@ void BiasAddSimple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {5.5, 5.5, 7.5, 7.5, 9.5, 9.5, 11.5, 11.5, 13.5, 13.5, 15.5, 15.5}); @@ -111,8 +111,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -132,7 +132,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { @@ -167,8 +167,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -188,7 +188,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } // namespace test diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index 7c59c822..0fa34c30 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -24,9 +24,10 @@ namespace ops { template class BufferToImageOp : public Operator { public: - BufferToImageOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} + BufferToImageOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input_tensor = this->Input(INPUT); diff --git a/mace/ops/cast.h b/mace/ops/cast.h index cee022ec..56d20d52 100644 --- a/mace/ops/cast.h +++ b/mace/ops/cast.h @@ -25,8 +25,8 @@ namespace ops { template class CastOp : public Operator { public: - CastOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + CastOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { MACE_UNUSED(future); diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h index bd9234c1..a459a0b3 100644 --- a/mace/ops/channel_shuffle.h +++ b/mace/ops/channel_shuffle.h @@ -26,10 +26,10 @@ namespace ops { template class ChannelShuffleOp : public Operator { public: - ChannelShuffleOp(const OperatorDef &operator_def, 
Workspace *ws) - : Operator(operator_def, ws), + ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), group_(OperatorBase::GetOptionalArg("group", 1)), - functor_(this->group_) {} + functor_(context, this->group_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index 0b674dab..2102fe76 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -45,7 +45,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { NHWC); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 8}, {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -77,7 +77,7 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { kernels::BufferType::IN_OUT_CHANNEL); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 16}, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31}); diff --git a/mace/ops/concat.h b/mace/ops/concat.h index be763714..94dee3d3 100644 --- a/mace/ops/concat.h +++ b/mace/ops/concat.h @@ -26,9 +26,9 @@ namespace ops { template class ConcatOp : public Operator { public: - ConcatOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 3)) {} + ConcatOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} MaceStatus Run(StatsFuture *future) override { MACE_CHECK(this->InputSize() >= 2) diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index b15045cd..5864e1ed 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -28,9 +28,10 @@ namespace ops { template class Conv2dOp : public ConvPool2dOpBase { public: - Conv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), + Conv2dOp(const OperatorDef &op_def, OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), + functor_(context, + this->strides_.data(), this->padding_type_, this->paddings_, this->dilations_.data(), @@ -40,7 +41,7 @@ class Conv2dOp : public ConvPool2dOpBase { OperatorBase::GetOptionalArg("max_limit", 0.0f), static_cast(OperatorBase::GetOptionalArg( "is_filter_transformed", false)), - ws->GetScratchBuffer(D)) {} + context->workspace()->GetScratchBuffer(D)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index ecfdafa2..dd338e27 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -84,7 +84,7 @@ void TestNHWCSimple3x3VALID() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {18.1f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -147,7 +147,7 @@ void TestNHWCSimple3x3SAME() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 1}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); @@ -221,7 +221,7 @@ void TestNHWCSimple3x3WithoutBias() { } // Check - auto expected = CreateTensor({1, 1, 1, 1}, {18.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {18.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -298,7 +298,7 @@ void 
TestNHWCCombined3x3() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -374,7 +374,7 @@ void TestFusedNHWCSimple3x3VALID() { MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } template @@ -434,7 +434,7 @@ void TestFusedNHWCSimple3x3WithoutBias() { } // Check - auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } @@ -515,7 +515,7 @@ void TestConv1x1() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 10, 2}, {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, @@ -576,8 +576,8 @@ void TestComplexConvNxNS12(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -602,7 +602,7 @@ void TestComplexConvNxNS12(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -685,8 +685,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -712,7 +712,7 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -837,8 +837,8 @@ void TestDilationConvNxN(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -863,7 +863,7 @@ void TestDilationConvNxN(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -934,8 +934,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -960,7 +960,7 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 
1e-2, 1e-1); }; @@ -1021,8 +1021,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -1046,7 +1046,7 @@ void TestArbitraryPadConvNxN(const std::vector &shape, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -1104,7 +1104,7 @@ void TestQuantSimple3x3() { // Run net.Run(); // Check - auto expected = CreateTensor({1, 1, 1, 1}, {230}); + auto expected = net.CreateTensor({1, 1, 1, 1}, {230}); ExpectTensorNear(*expected, *output); } diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h index 9c4860df..0a8a8c17 100644 --- a/mace/ops/conv_pool_2d_base.h +++ b/mace/ops/conv_pool_2d_base.h @@ -26,8 +26,8 @@ namespace ops { template class ConvPool2dOpBase : public Operator { public: - ConvPool2dOpBase(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), strides_(OperatorBase::GetRepeatedArgs("strides")), padding_type_(static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(SAME)))), diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index ac184c80..8eecd77d 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -21,6 +21,8 @@ namespace test { TEST(CoreTest, INIT_MODE) { std::vector op_defs; + Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); + std::unique_ptr> tuner; Workspace ws; op_defs.emplace_back(OperatorDef()); @@ -31,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { .AddIntArg("mode", static_cast(NetMode::INIT)) .Finalize(&op_defs[op_defs.size() - 1]); - Tensor *input = ws.CreateTensor("Input", GetDeviceAllocator(DeviceType::GPU), + Tensor *input = ws.CreateTensor("Input", device->allocator(), DataTypeToEnum::v()); input->Resize({1, 3, 3, 3}); { @@ -53,13 +55,13 @@ TEST(CoreTest, INIT_MODE) { } std::shared_ptr op_registry(new OperatorRegistry()); auto net = - CreateNet(op_registry, net_def, &ws, DeviceType::GPU, NetMode::INIT); + CreateNet(op_registry, net_def, &ws, device, NetMode::INIT); net->Run(); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - net = CreateNet(op_registry, net_def, &ws, DeviceType::GPU); + net = CreateNet(op_registry, net_def, &ws, device); net->Run(); EXPECT_TRUE(ws.GetTensor("Output") != nullptr); diff --git a/mace/ops/crop.h b/mace/ops/crop.h index f1f179b9..f5045069 100644 --- a/mace/ops/crop.h +++ b/mace/ops/crop.h @@ -26,9 +26,10 @@ namespace ops { template class CropOp : public Operator { public: - CropOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 2), + CropOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("axis", 2), OperatorBase::GetRepeatedArgs("offset")) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index a28205b9..b4bb7fdd 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -75,7 +75,7 @@ void RunCrop(const std::vector &input_shape, "Output", NHWC); } // Check - auto expected = 
CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output")); } } // namespace diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h index 188b8ba0..ec5b348e 100644 --- a/mace/ops/deconv_2d.h +++ b/mace/ops/deconv_2d.h @@ -26,9 +26,10 @@ namespace ops { template class Deconv2dOp : public Operator { public: - Deconv2dOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("strides"), + Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("strides"), static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(SAME))), OperatorBase::GetRepeatedArgs("padding_values"), diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 954d6bf4..67d0ac14 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -79,7 +79,7 @@ void RunTestSimple(const std::vector &input_shape, "Output", NHWC); } - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.0001); } @@ -350,8 +350,8 @@ void TestComplexDeconvNxNS12(const int batch, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // run on gpu BufferToImage(&net, "Input", "InputImage", @@ -377,7 +377,7 @@ void TestComplexDeconvNxNS12(const int batch, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h index 4be3f2a0..49183873 100644 --- a/mace/ops/depth_to_space.h +++ b/mace/ops/depth_to_space.h @@ -27,10 +27,10 @@ namespace ops { template class DepthToSpaceOp : public Operator { public: - DepthToSpaceOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), block_size_(OperatorBase::GetOptionalArg("block_size", 1)), - functor_(this->block_size_, true) {} + functor_(context, this->block_size_, true) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index e61590ff..99c4fb0b 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -64,7 +64,7 @@ void RunDepthToSpace(const bool d2s, ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(expected_shape, expected_data); + auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index 2762aea5..549af07a 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -29,9 +29,10 @@ namespace ops { template class DepthwiseConv2dOp : public ConvPool2dOpBase { public: - DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), - functor_(this->strides_.data(), + DepthwiseConv2dOp(const OperatorDef &op_def, 
OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), + functor_(context, + this->strides_.data(), this->padding_type_, this->paddings_, this->dilations_.data(), diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index a2d57911..6d6b84f1 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -80,7 +80,7 @@ void SimpleValidTest() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f}); @@ -212,7 +212,7 @@ void ComplexValidTest(index_t batch, } auto expected = - CreateTensor({1, out_height, out_width, out_channels}, expect); + net.CreateTensor({1, out_height, out_width, out_channels}, expect); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -284,8 +284,8 @@ void TestNxNS12(const index_t height, const index_t width) { "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -312,10 +312,10 @@ void TestNxNS12(const index_t height, const index_t width) { // Check if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-4); } else { - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-2, 1e-2); } }; @@ -387,7 +387,7 @@ void QuantSimpleValidTest() { net.Run(); // Check - auto expected = CreateTensor({1, 1, 1, 2}, {255, 21}); + auto expected = net.CreateTensor({1, 1, 1, 2}, {255, 21}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h index 161d0e4f..f7952562 100644 --- a/mace/ops/eltwise.h +++ b/mace/ops/eltwise.h @@ -24,9 +24,10 @@ namespace ops { template class EltwiseOp : public Operator { public: - EltwiseOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + EltwiseOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), functor_( + context, static_cast(OperatorBase::GetOptionalArg( "type", static_cast(kernels::EltwiseType::NONE))), OperatorBase::GetRepeatedArgs("coeff"), diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 55a0ce97..76b04f34 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -49,7 +49,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type, MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor({}, {output}); + auto expected = net.CreateTensor({}, {output}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -97,7 +97,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(shape, output); + auto expected = net.CreateTensor(shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -167,7 +167,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type, if (input0.size() < input1.size()) { output_shape = shape1; } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -206,7 +206,7 @@ void TensorGeneralBroadcastEltwise(const kernels::EltwiseType type, 
MACE_NOT_IMPLEMENTED; } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -476,8 +476,8 @@ void RandomTensorScalar(const kernels::EltwiseType type, net.RunOp(DeviceType::CPU); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImg", kernels::BufferType::IN_OUT_CHANNEL); @@ -496,9 +496,9 @@ void RandomTensorScalar(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } @@ -531,8 +531,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type, net.RunOp(DeviceType::CPU); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input0", "InputImg0", kernels::BufferType::IN_OUT_CHANNEL); @@ -554,9 +554,9 @@ void RandomTensorEltwise(const kernels::EltwiseType type, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); } else { - ExpectTensorNear(expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); } } } // namespace diff --git a/mace/ops/fill.h b/mace/ops/fill.h index a8b55dbe..b6836d11 100644 --- a/mace/ops/fill.h +++ b/mace/ops/fill.h @@ -26,9 +26,9 @@ namespace ops { template class FillOp : public Operator { public: - FillOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_() {} + FillOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *shape = this->Input(SHAPE); diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h index 9cd76c73..345d87b4 100644 --- a/mace/ops/folded_batch_norm.h +++ b/mace/ops/folded_batch_norm.h @@ -26,9 +26,10 @@ namespace ops { template class FoldedBatchNormOp : public Operator { public: - FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(true, + FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + true, kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 3979583a..16a6ad68 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -83,7 +83,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125, 0.8291, 0.8291, 3.1708, 3.1708, 5.5125, 5.5125, 7.8543, 7.8543}); @@ -129,8 +129,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { NHWC); // Check 
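A recurring pattern in these test hunks is worth spelling out once: with the global GetDeviceAllocator() being removed, a default-constructed, stack-allocated "Tensor expected;" no longer has an allocator to draw from, so the tests switch to OpsTestNet::CreateTensor(), which returns a std::unique_ptr<Tensor> backed by the per-device allocator of the OpTestContext singleton introduced later in this patch; all later uses then dereference the pointer. A minimal sketch of the shape these tests converge on (names and tolerances are illustrative):

    OpsTestNet net;
    // ... build and run the reference net on the CPU ...
    auto expected = net.CreateTensor();        // unique_ptr<Tensor>; allocator from OpTestContext
    expected->Copy(*net.GetOutput("Output"));  // '->' replaces '.'
    // ... run the same net on the GPU ...
    ExpectTensorNear<float>(*expected,         // dereference when comparing
                            *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4);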
- Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -153,7 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { @@ -190,8 +191,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -215,7 +216,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-2); } TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { @@ -252,8 +254,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -275,7 +277,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-5, 1e-4); } TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { @@ -312,8 +315,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -336,7 +339,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + 1e-2, 1e-2); } } // namespace test diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h index 8ec00391..313780cb 100644 --- a/mace/ops/fully_connected.h +++ b/mace/ops/fully_connected.h @@ -26,9 +26,9 @@ namespace ops { template class FullyConnectedOp : public Operator { public: - FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(kernels::StringToActivationType( + FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} @@ -61,7 +61,8 @@ class FullyConnectedOp : public Operator { " don't match."); } - return functor_(input, weight, bias, output, future); + return functor_(input, weight, + bias, output, future); } private: diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 8b30096d..cdeba243 100644 --- 
a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -76,7 +76,7 @@ void Simple(const std::vector &input_shape, } // Check - auto expected = CreateTensor(output_shape, output_value); + auto expected = net.CreateTensor(output_shape, output_value); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -156,8 +156,8 @@ void Random(const index_t batch, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); // Check - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); // Run on opencl BufferToImage(&net, "Input", "InputImage", @@ -181,10 +181,10 @@ void Random(const index_t batch, ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-1); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-3); } } diff --git a/mace/ops/gather.h b/mace/ops/gather.h index 37689b30..fe4026d9 100644 --- a/mace/ops/gather.h +++ b/mace/ops/gather.h @@ -24,9 +24,10 @@ namespace ops { template class GatherOp : public Operator { public: - GatherOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0), + GatherOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("axis", 0), OperatorBase::GetOptionalArg("y", 1.0)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 3a35b338..07a8438c 100644 --- a/mace/ops/gather_test.cc +++ b/mace/ops/gather_test.cc @@ -47,7 +47,7 @@ void TestGather(const std::vector &weight_shape, // Run net.RunOp(CPU); - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } diff --git a/mace/ops/identity.h b/mace/ops/identity.h index 7140314c..be4d75bf 100644 --- a/mace/ops/identity.h +++ b/mace/ops/identity.h @@ -25,8 +25,8 @@ namespace ops { template class IdentityOp : public Operator { public: - IdentityOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + IdentityOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index c1b9b0b8..fc259a01 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -24,9 +24,10 @@ namespace ops { template class ImageToBufferOp : public Operator { public: - ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} + ImageToBufferOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/infer_conv2d_shape.h b/mace/ops/infer_conv2d_shape.h index bc6163c1..a39f66b6 100644 --- a/mace/ops/infer_conv2d_shape.h +++ b/mace/ops/infer_conv2d_shape.h @@ 
-26,8 +26,8 @@ namespace ops { template class InferConv2dShapeOp : public Operator { public: - InferConv2dShapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h index d8ad1d3e..66265f19 100644 --- a/mace/ops/local_response_norm.h +++ b/mace/ops/local_response_norm.h @@ -24,8 +24,8 @@ namespace ops { template class LocalResponseNormOp : public Operator { public: - LocalResponseNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), functor_() { + LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) { depth_radius_ = OperatorBase::GetOptionalArg("depth_radius", 5); bias_ = OperatorBase::GetOptionalArg("bias", 1.0f); alpha_ = OperatorBase::GetOptionalArg("alpha", 1.0f); diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index dc12f28a..6bb726ea 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -46,7 +46,7 @@ void Simple() { } // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 6}, {0.28, 0.28, 0.39, 0.39, 0.51, 0.51, 0.34, 0.34, 0.40, 0.40, 0.47, 0.47}); diff --git a/mace/ops/lstmcell.h b/mace/ops/lstmcell.h index 300794f2..3037c891 100644 --- a/mace/ops/lstmcell.h +++ b/mace/ops/lstmcell.h @@ -26,10 +26,12 @@ namespace ops { template class LSTMCellOp : public Operator { public: - LSTMCellOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast( - OperatorBase::GetOptionalArg("scalar_input", 0.0))) {} + LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + static_cast( + OperatorBase::GetOptionalArg("scalar_input", + 0.0))) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h index e5e0dafa..ceccb939 100644 --- a/mace/ops/matmul.h +++ b/mace/ops/matmul.h @@ -24,8 +24,9 @@ namespace ops { template class MatMulOp : public Operator { public: - MatMulOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + MatMulOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context), transpose_a_(OperatorBase::GetOptionalArg("transpose_a", false)), transpose_b_(OperatorBase::GetOptionalArg("transpose_b", false)) { } @@ -46,7 +47,8 @@ class MatMulOp : public Operator { MACE_CHECK(ak == bk, "the number of A's column ", ak, " must be equal to B's row ", bk); - return functor_(A, B, C, transpose_a_, transpose_b_, future); + return functor_(A, B, C, + transpose_a_, transpose_b_, future); } private: diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 18a9ddc8..9225b226 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -65,7 +65,7 @@ void Simple(const std::vector &A_shape, } // Check - auto expected = CreateTensor(C_shape, C_value); + auto expected = net.CreateTensor(C_shape, C_value); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -171,15 +171,15 @@ void Complex(const std::vector &batch, // Check EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); - 
Tensor expected; - expected.Copy(*net.GetOutput("Output")); - expected.Reshape({batch_count, height, out_width}); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); + expected->Reshape({batch_count, height, out_width}); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5, 1e-5); } } diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc new file mode 100644 index 00000000..5be4cb96 --- /dev/null +++ b/mace/ops/ops_test_util.cc @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +OpTestContext *OpTestContext::Get() { + static OpTestContext instance; + return &instance; +} + +std::shared_ptr OpTestContext::gpu_context() const { + return gpu_context_; +} + +Device *OpTestContext::GetDevice(DeviceType device_type) { + return device_map_[device_type].get(); +} + +OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) { + device_map_[DeviceType::CPU] = std::unique_ptr(new CPUDevice(-1)); + device_map_[DeviceType::GPU] = std::unique_ptr( + new GPUDevice(gpu_context_->opencl_tuner(), + gpu_context_->opencl_cache_storage(), + GPUPriorityHint::PRIORITY_NORMAL)); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 2dc29241..278c3515 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -26,7 +27,8 @@ #include "gtest/gtest.h" #include "mace/core/net.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/device_context.h" +#include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/kernels/opencl/common.h" @@ -110,9 +112,28 @@ class OpDefBuilder { OperatorDef op_def_; }; +class OpTestContext { + public: + static OpTestContext *Get(); + std::shared_ptr gpu_context() const; + Device *GetDevice(DeviceType device_type); + private: + OpTestContext(); + MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); + + std::shared_ptr gpu_context_; + std::map> device_map_; +}; + class OpsTestNet { public: - OpsTestNet() : op_registry_(new OperatorRegistry()) {} + OpsTestNet() : + op_registry_(new OperatorRegistry()) { + } + + ~OpsTestNet() { + Sync(); + } template void AddInputFromArray(const std::string &name, @@ -121,7 +142,8 @@ class OpsTestNet { const float scale = 0.0, const int32_t zero_point = 0) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + 
DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -136,7 +158,8 @@ class OpsTestNet { const std::vector &shape, const T data) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -149,7 +172,8 @@ class OpsTestNet { bool positive = true, bool truncate = false) { Tensor *input = - ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum::v()); + ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -184,8 +208,10 @@ class OpsTestNet { template void Transpose2D(const std::string &src_name, const std::string &dst_name) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 2, "input shape != 2"); output->Resize({input_shape[1], input_shape[0]}); @@ -205,8 +231,10 @@ class OpsTestNet { void CopyData(const std::string &src_name, const std::string &dst_name) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); output->Resize(input_shape); @@ -222,8 +250,10 @@ class OpsTestNet { const std::string &dst_name, const DataFormat dst_format) { Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor(dst_name, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); @@ -352,8 +382,10 @@ class OpsTestNet { void FillNHWCInputToNCHWInput(const std::string &name_nchw, const std::string &name_nhwc) { Tensor *input = ws_.GetTensor(name_nhwc); - Tensor *output = ws_.CreateTensor(name_nchw, GetDeviceAllocator(D), - DataTypeToEnum::v()); + Tensor *output = ws_.CreateTensor( + name_nchw, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v()); const std::vector input_shape = input->shape(); index_t batch = input_shape[0]; index_t height = input_shape[1]; @@ -374,6 +406,22 @@ class OpsTestNet { } } + // Create standalone tensor on device D with T type. 
+ template + std::unique_ptr CreateTensor( + const std::vector &shape = {}, + const std::vector &data = {}) { + std::unique_ptr res( + new Tensor(OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v())); + if (!data.empty()) { + res->Resize(shape); + T *input_data = res->mutable_data(); + memcpy(input_data, data.data(), data.size() * sizeof(T)); + } + return res; + } + OperatorDef *NewOperatorDef() { op_defs_.clear(); op_defs_.emplace_back(OperatorDef()); @@ -392,8 +440,9 @@ class OpsTestNet { for (auto &op_def_ : op_defs_) { net_def.add_op()->CopyFrom(op_def_); } - net_ = CreateNet(op_registry_, net_def, &ws_, device); - device_ = device; + net_ = CreateNet(op_registry_, net_def, &ws_, + OpTestContext::Get()->GetDevice(device)); + device_type_ = device; return net_ != nullptr; } @@ -416,10 +465,15 @@ class OpsTestNet { MaceStatus RunOp() { return RunOp(DeviceType::CPU); } MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { - device_ = device; - net_ = CreateNet(op_registry_, net_def, &ws_, device, NetMode::INIT); + device_type_ = device; + net_ = CreateNet(op_registry_, + net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + NetMode::INIT); MACE_RETURN_IF_ERROR(net_->Run()); - net_ = CreateNet(op_registry_, net_def, &ws_, device); + net_ = CreateNet(op_registry_, net_def, &ws_, + OpTestContext::Get()->GetDevice(device)); return net_->Run(); } @@ -432,9 +486,12 @@ class OpsTestNet { } void Sync() { - if (net_ && device_ == DeviceType::GPU) { - OpenCLRuntime::Global()->command_queue().finish(); +#ifdef MACE_ENABLE_OPENCL + if (net_ && device_type_ == DeviceType::GPU) { + OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() + ->command_queue().finish(); } +#endif } public: @@ -442,17 +499,17 @@ class OpsTestNet { Workspace ws_; std::vector op_defs_; std::unique_ptr net_; - DeviceType device_; + DeviceType device_type_; }; class OpsTestBase : public ::testing::Test { protected: virtual void SetUp() { - // OpenCLRuntime::CreateGlobal(); + SetOpenMPThreadsAndAffinityPolicy(-1, + CPUAffinityPolicy::AFFINITY_BIG_ONLY); } virtual void TearDown() { - // OpenCLRuntime::DestroyGlobal(); } }; @@ -510,17 +567,6 @@ std::vector VectorStaticCast(const std::vector &&src) { return std::move(dest); } -template -std::unique_ptr CreateTensor(const std::vector &shape, - const std::vector &data) { - std::unique_ptr res( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum::v())); - res->Resize(shape); - T *input_data = res->mutable_data(); - memcpy(input_data, data.data(), data.size() * sizeof(T)); - return res; -} - inline bool IsSameSize(const Tensor &x, const Tensor &y) { if (x.dim_size() != y.dim_size()) return false; for (int d = 0; d < x.dim_size(); ++d) { diff --git a/mace/ops/pad.h b/mace/ops/pad.h index 98677109..6a7ce102 100644 --- a/mace/ops/pad.h +++ b/mace/ops/pad.h @@ -26,9 +26,10 @@ namespace ops { template class PadOp : public Operator { public: - PadOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("paddings"), + PadOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("paddings"), OperatorBase::GetOptionalArg("constant_value", 0.0)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 2f4a9721..3a68248e 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -63,7 +63,7 @@ void Simple() { auto 
output = net.GetTensor("Output"); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 5, 6, 1}, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 2, 2, 2, 1.0, 1.0, 1.0, 1.0, @@ -99,7 +99,7 @@ TEST_F(PadTest, ComplexCPU) { auto output = net.GetTensor("Output"); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 3, 3, 4}, { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, @@ -134,8 +134,8 @@ void Complex(const std::vector &input_shape, net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -155,9 +155,9 @@ void Complex(const std::vector &input_shape, auto output = net.GetTensor("OpenCLOutput"); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *output, 1e-2, 1e-2); + ExpectTensorNear(*expected, *output, 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *output, 1e-5); + ExpectTensorNear(*expected, *output, 1e-5); } } } // namespace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index fac4e1dd..3d1753b3 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,13 +27,14 @@ namespace ops { template class PoolingOp : public ConvPool2dOpBase { public: - PoolingOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws), + PoolingOp(const OperatorDef &op_def, OpKernelContext *context) + : ConvPool2dOpBase(op_def, context), kernels_(OperatorBase::GetRepeatedArgs("kernels")), pooling_type_( static_cast(OperatorBase::GetOptionalArg( "pooling_type", static_cast(AVG)))), - functor_(pooling_type_, + functor_(context, + pooling_type_, kernels_.data(), this->strides_.data(), this->padding_type_, diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 72a4fdee..2f02d729 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -57,7 +57,7 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + net.CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -90,7 +90,7 @@ TEST_F(PoolingOpTest, MAX_SAME) { NHWC); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -124,7 +124,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { NHWC); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -158,7 +158,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { NHWC); // Check - auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); + auto expected = net.CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -209,7 +209,7 @@ void SimpleMaxPooling3S2() { } // Check - auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); + auto expected = net.CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -249,8 +249,8 @@ void MaxPooling3S2(const std::vector &input_shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = 
net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -269,10 +269,10 @@ void MaxPooling3S2(const std::vector &input_shape, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } } // namespace @@ -334,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) { NHWC); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -368,7 +368,7 @@ void SimpleAvgPoolingTest() { kernels::BufferType::IN_OUT_CHANNEL); // Check - auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); + auto expected = net.CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -407,8 +407,8 @@ void AvgPoolingTest(const std::vector &shape, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -427,10 +427,10 @@ void AvgPoolingTest(const std::vector &shape, kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, 1e-3); } else { - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } } // namespace @@ -503,7 +503,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_VALID) { // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + net.CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -531,7 +531,7 @@ TEST_F(PoolingOpTest, QUANT_MAX_SAME) { net.RunOp(); // Check - auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); + auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -561,7 +561,7 @@ TEST_F(PoolingOpTest, QUANT_AVG_VALID) { net.RunOp(); // Check - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 2, 2, 2}, {3, 19, 5, 21, 11, 27, 13, 29}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h index 1afabb8f..d879e240 100644 --- a/mace/ops/proposal.h +++ b/mace/ops/proposal.h @@ -24,9 +24,10 @@ namespace ops { template class ProposalOp : public Operator { public: - ProposalOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("min_size", 16), + ProposalOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("min_size", 16), OperatorBase::GetOptionalArg("nms_thresh", 0.7), OperatorBase::GetOptionalArg("pre_nms_top_n", 6000), OperatorBase::GetOptionalArg("post_nms_top_n", 300), diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc 
index c5b71ad2..e8b2ae5a 100644 --- a/mace/ops/proposal_test.cc +++ b/mace/ops/proposal_test.cc @@ -60,7 +60,8 @@ TEST_F(ProposalOpTest, CPUSimple) { // Run net.RunOp(); - auto expected_tensor = CreateTensor({1, 1, 1, 5}, {0, 0, 0, 255, 255}); + auto expected_tensor = net.CreateTensor({1, 1, 1, 5}, + {0, 0, 0, 255, 255}); ExpectTensorNear(*expected_tensor, *net.GetTensor("Output"), 1e-5); } diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h index eb78489b..2e7a77c2 100644 --- a/mace/ops/quantize.h +++ b/mace/ops/quantize.h @@ -24,8 +24,9 @@ namespace ops { template class QuantizeOp : public Operator { public: - QuantizeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context), non_zero_( static_cast(OperatorBase::GetOptionalArg("non_zero", 0))) {} @@ -50,8 +51,8 @@ class QuantizeOp : public Operator { template class DequantizeOp : public Operator { public: - DequantizeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/reduce_mean.h b/mace/ops/reduce_mean.h index 7cdaff86..0ef9c102 100644 --- a/mace/ops/reduce_mean.h +++ b/mace/ops/reduce_mean.h @@ -27,9 +27,10 @@ namespace ops { template class ReduceMeanOp : public Operator { public: - ReduceMeanOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("axis"), + ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("axis"), OperatorBase::GetOptionalArg("keepdims", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index 4f5a029e..2b1875de 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -57,7 +57,7 @@ void Simple(const std::vector &input_shape, ImageToBuffer(&net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL); } - auto expected = CreateTensor(output_shape, output); + auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); } diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h index c47e6cb1..86476de0 100644 --- a/mace/ops/reshape.h +++ b/mace/ops/reshape.h @@ -26,8 +26,8 @@ namespace ops { template class ReshapeOp : public Operator { public: - ReshapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + ReshapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/resize_bicubic.h b/mace/ops/resize_bicubic.h index a83f3a31..23b4c116 100644 --- a/mace/ops/resize_bicubic.h +++ b/mace/ops/resize_bicubic.h @@ -24,9 +24,10 @@ namespace ops { template class ResizeBicubicOp : public Operator { public: - ResizeBicubicOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("size", {-1, -1}), + ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + 
functor_(context, + OperatorBase::GetRepeatedArgs("size", {-1, -1}), OperatorBase::GetOptionalArg("align_corners", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 696e0f29..a3ad96f4 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -48,7 +48,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } @@ -77,7 +77,7 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { NHWC); // Check - auto expected = CreateTensor({1, 2, 3, 3}, + auto expected = net.CreateTensor({1, 2, 3, 3}, {0., 1., 2., 4.110297, 5.110297, 6.110297, 8.223037, 9.223036, 10.223037, 24., 25., 26., 28.110298, 29.1103, 30.110298, 32.223038, 33.223038, 34.223038}); @@ -110,7 +110,7 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h index fb389859..f328a9a4 100644 --- a/mace/ops/resize_bilinear.h +++ b/mace/ops/resize_bilinear.h @@ -24,9 +24,10 @@ namespace ops { template class ResizeBilinearOp : public Operator { public: - ResizeBilinearOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetRepeatedArgs("size", {-1, -1}), + ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("size", {-1, -1}), OperatorBase::GetOptionalArg("align_corners", false)) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 49dda888..5d284f86 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -48,7 +48,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -78,7 +78,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { NHWC); // Check - auto expected = CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); + auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -116,8 +116,8 @@ void TestRandomResizeBilinear() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { BufferToImage(&net, "Input", "InputImage", @@ -136,7 +136,7 @@ void TestRandomResizeBilinear() { kernels::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, 1e-6); } } diff --git a/mace/ops/scalar_math.h b/mace/ops/scalar_math.h index 29cb478c..356c9371 100644 --- a/mace/ops/scalar_math.h +++ b/mace/ops/scalar_math.h @@ -26,9 +26,10 @@ namespace ops { 
template class ScalarMathOp : public Operator { public: - ScalarMathOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(static_cast( + ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + static_cast( OperatorBase::GetOptionalArg( "type", static_cast(kernels::EltwiseType::NONE))), OperatorBase::GetRepeatedArgs("coeff"), diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc index 32b9db00..0d34b80a 100644 --- a/mace/ops/scalar_math_test.cc +++ b/mace/ops/scalar_math_test.cc @@ -49,60 +49,60 @@ void ScalarMathTest(const kernels::EltwiseType type, net.RunOp(D); - auto expected = CreateTensor({}, {output}); + auto expected = net.CreateTensor({}, {output}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace TEST_F(ScalarMathOpTest, SimpleCPU) { -ScalarMathTest( - kernels::EltwiseType::SUM, 1, 2, 3, 3); -ScalarMathTest( - kernels::EltwiseType::SUB, 1, 2, 3, -1); -ScalarMathTest( - kernels::EltwiseType::PROD, 3, -2, 3, -6); -ScalarMathTest( - kernels::EltwiseType::DIV, 3, -2, 1, -1.5); -ScalarMathTest( - kernels::EltwiseType::MIN, 3, -2, 1, -2); -ScalarMathTest( - kernels::EltwiseType::MAX, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::NEG, 3, -2, 1, -3); -ScalarMathTest( - kernels::EltwiseType::ABS, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); -ScalarMathTest( - kernels::EltwiseType::POW, 3, 1, 1, 3); -ScalarMathTest( - kernels::EltwiseType::EQUAL, 3, 3, 1, 1); + ScalarMathTest( + kernels::EltwiseType::SUM, 1, 2, 3, 3); + ScalarMathTest( + kernels::EltwiseType::SUB, 1, 2, 3, -1); + ScalarMathTest( + kernels::EltwiseType::PROD, 3, -2, 3, -6); + ScalarMathTest( + kernels::EltwiseType::DIV, 3, -2, 1, -1.5); + ScalarMathTest( + kernels::EltwiseType::MIN, 3, -2, 1, -2); + ScalarMathTest( + kernels::EltwiseType::MAX, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::NEG, 3, -2, 1, -3); + ScalarMathTest( + kernels::EltwiseType::ABS, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); + ScalarMathTest( + kernels::EltwiseType::POW, 3, 1, 1, 3); + ScalarMathTest( + kernels::EltwiseType::EQUAL, 3, 3, 1, 1); } TEST_F(ScalarMathOpTest, SimpleGPU) { -ScalarMathTest( - kernels::EltwiseType::SUM, 1, 2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SUB, 1, 2, 1, -1); -ScalarMathTest( - kernels::EltwiseType::PROD, 3, -2, 1, -6); -ScalarMathTest( - kernels::EltwiseType::DIV, 3, -2, 1, -1.5); -ScalarMathTest( - kernels::EltwiseType::MIN, 3, -2, 1, -2); -ScalarMathTest( - kernels::EltwiseType::MAX, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::NEG, 3, -2, 1, -3); -ScalarMathTest( - kernels::EltwiseType::ABS, 3, -2, 1, 3); -ScalarMathTest( - kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); -ScalarMathTest( - kernels::EltwiseType::POW, 3, 1, 1, 3); -ScalarMathTest( - kernels::EltwiseType::EQUAL, 3, 3, 1, 1); + ScalarMathTest( + kernels::EltwiseType::SUM, 1, 2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SUB, 1, 2, 1, -1); + ScalarMathTest( + kernels::EltwiseType::PROD, 3, -2, 1, -6); + ScalarMathTest( + kernels::EltwiseType::DIV, 3, -2, 1, -1.5); + ScalarMathTest( + kernels::EltwiseType::MIN, 3, -2, 1, -2); + ScalarMathTest( + kernels::EltwiseType::MAX, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::NEG, 3, -2, 1, -3); + ScalarMathTest( + kernels::EltwiseType::ABS, 3, -2, 1, 3); + ScalarMathTest( + kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25); + ScalarMathTest( + 
kernels::EltwiseType::POW, 3, 1, 1, 3); + ScalarMathTest( + kernels::EltwiseType::EQUAL, 3, 3, 1, 1); } } // namespace test } // namespace ops diff --git a/mace/ops/shape.h b/mace/ops/shape.h index 98f139e4..abb9ffb3 100644 --- a/mace/ops/shape.h +++ b/mace/ops/shape.h @@ -25,8 +25,8 @@ namespace ops { template class ShapeOp : public Operator { public: - ShapeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + ShapeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h index 0a6868f0..047402f0 100644 --- a/mace/ops/softmax.h +++ b/mace/ops/softmax.h @@ -24,8 +24,9 @@ namespace ops { template class SoftmaxOp : public Operator { public: - SoftmaxOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws) {} + SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context) {} MaceStatus Run(StatsFuture *future) override { const Tensor *logits = this->Input(LOGITS); diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 827067f4..012424c5 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -29,7 +29,7 @@ void Simple() { // Add input data net.AddInputFromArray("Input", {1, 1, 2, 4}, {1, 1, 1, 1, 1, 2, 3, 4}); - auto expected = CreateTensor( + auto expected = net.CreateTensor( {1, 1, 2, 4}, {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426}); @@ -113,8 +113,8 @@ void Complex(const std::vector &logits_shape) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("Output")); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); @@ -131,7 +131,7 @@ void Complex(const std::vector &logits_shape) { ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); } } // namespace diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h index 7ce0dd13..170bde09 100644 --- a/mace/ops/space_to_batch.h +++ b/mace/ops/space_to_batch.h @@ -27,9 +27,10 @@ namespace ops { template class SpaceToBatchNDOp : public Operator { public: - SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), + SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), OperatorBase::GetRepeatedArgs("block_shape", {1, 1}), false) {} diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 5539bfd6..8a3c35fe 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -116,24 +116,23 @@ void TestBidirectionalTransform(const std::vector &space_shape, const std::vector &padding_data, const std::vector &batch_shape, const std::vector &batch_data) { - auto space_tensor = std::unique_ptr( - new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); + OpsTestNet net; + auto space_tensor = net.CreateTensor(); space_tensor->Resize(space_shape); { Tensor::MappingGuard 
space_mapper(space_tensor.get()); - T *space_ptr = space_tensor->mutable_data(); + T *space_ptr = space_tensor->template mutable_data(); MACE_CHECK(static_cast(space_tensor->size()) == space_data.size()) << "Space tensor size:" << space_tensor->size() << ", space data size:" << space_data.size(); memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T)); } - auto batch_tensor = std::unique_ptr( - new Tensor(GetDeviceAllocator(DeviceType::GPU), DataTypeToEnum::v())); + auto batch_tensor = net.CreateTensor(); batch_tensor->Resize(batch_shape); { Tensor::MappingGuard batch_mapper(batch_tensor.get()); - T *batch_ptr = batch_tensor->mutable_data(); + T *batch_ptr = batch_tensor->template mutable_data(); MACE_CHECK(static_cast(batch_tensor->size()) == batch_data.size()); memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(T)); } diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h index 44ca7e5c..75dd27ed 100644 --- a/mace/ops/space_to_depth.h +++ b/mace/ops/space_to_depth.h @@ -27,9 +27,11 @@ namespace ops { template class SpaceToDepthOp : public Operator { public: - SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("block_size", 1), false) {} + SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, + OperatorBase::GetOptionalArg("block_size", 1), + false) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/split.h b/mace/ops/split.h index 710cdfb3..aa41aa15 100644 --- a/mace/ops/split.h +++ b/mace/ops/split.h @@ -26,9 +26,9 @@ namespace ops { template class SplitOp : public Operator { public: - SplitOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 3)) {} + SplitOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} MaceStatus Run(StatsFuture *future) override { MACE_CHECK(this->OutputSize() >= 2) diff --git a/mace/ops/squeeze.h b/mace/ops/squeeze.h index 35b2aed4..7febfb0e 100644 --- a/mace/ops/squeeze.h +++ b/mace/ops/squeeze.h @@ -26,8 +26,8 @@ namespace ops { template class SqueezeOp : public Operator { public: - SqueezeOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), + SqueezeOp(const OperatorDef &op_def, OpKernelContext *context) + : Operator(op_def, context), axis_(OperatorBase::GetRepeatedArgs("axis", {})) {} MaceStatus Run(StatsFuture *future) override { diff --git a/mace/ops/stack.h b/mace/ops/stack.h index 17210fb2..be25c0b0 100644 --- a/mace/ops/stack.h +++ b/mace/ops/stack.h @@ -26,9 +26,9 @@ namespace ops { template class StackOp : public Operator { public: - StackOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0)) {} + StackOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} MaceStatus Run(StatsFuture *future) override { const std::vector &inputs = this->Inputs(); diff --git a/mace/ops/strided_slice.h b/mace/ops/strided_slice.h index 57653359..249dc3e9 100644 --- a/mace/ops/strided_slice.h +++ b/mace/ops/strided_slice.h @@ -24,9 +24,10 @@ namespace ops { template class StridedSliceOp : public Operator { public: - StridedSliceOp(const OperatorDef &operator_def, Workspace *ws) 
- : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("begin_mask", 0), + StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, + OperatorBase::GetOptionalArg("begin_mask", 0), OperatorBase::GetOptionalArg("end_mask", 0), OperatorBase::GetOptionalArg("ellipsis_mask", 0), OperatorBase::GetOptionalArg("new_axis_mask", 0), diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h index 1ad73db9..91aa3365 100644 --- a/mace/ops/transpose.h +++ b/mace/ops/transpose.h @@ -26,10 +26,10 @@ namespace mace { template class TransposeOp : public Operator { public: - TransposeOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), + TransposeOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), dims_(OperatorBase::GetRepeatedArgs("dims")), - functor_(dims_) {} + functor_(context, dims_) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/unstack.h b/mace/ops/unstack.h index 1f743bd5..1c3d1764 100644 --- a/mace/ops/unstack.h +++ b/mace/ops/unstack.h @@ -26,9 +26,9 @@ namespace ops { template class UnstackOp : public Operator { public: - UnstackOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), - functor_(OperatorBase::GetOptionalArg("axis", 0)) {} + UnstackOp(const OperatorDef &operator_def, OpKernelContext *context) + : Operator(operator_def, context), + functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 2406a361..3cd5ab92 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -64,9 +64,10 @@ void WinogradConvolution(const index_t batch, // Transfer output ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); - Tensor expected; - expected.Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected.shape(); + + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected->shape(); // Winograd convolution // transform filter @@ -124,9 +125,11 @@ void WinogradConvolution(const index_t batch, ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-2, 1e-2); } else { - ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4); + ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), + 1e-5, 1e-4); } } } // namespace @@ -212,9 +215,9 @@ void WinogradConvolutionWithPad(const index_t batch, // Transfer output ImageToBuffer(&net, "OutputImage", "ConvOutput", kernels::BufferType::IN_OUT_CHANNEL); - Tensor expected; - expected.Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected.shape(); + auto expected = net.CreateTensor(); + expected->Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected->shape(); // Winograd convolution // transform filter @@ -272,9 +275,11 @@ void WinogradConvolutionWithPad(const index_t batch, ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", kernels::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - 
    ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-2, 1e-2);
+    ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"),
+                     1e-2, 1e-2);
   } else {
-    ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-5, 1e-4);
+    ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"),
+                     1e-5, 1e-4);
   }
 }
 }  // namespace
diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h
index 0349de8a..548c889a 100644
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -29,9 +29,11 @@ namespace ops {
 template
 class WinogradInverseTransformOp : public Operator {
  public:
-  WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator(op_def, ws),
-        functor_(kernels::StringToActivationType(
+  WinogradInverseTransformOp(const OperatorDef &op_def,
+                             OpKernelContext *context)
+      : Operator(op_def, context),
+        functor_(context,
+                 kernels::StringToActivationType(
                      OperatorBase::GetOptionalArg("activation",
                                                   "NOOP")),
                  OperatorBase::GetOptionalArg("max_limit", 0.0f),
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index db874287..2274b6e8 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -26,9 +26,10 @@ namespace ops {
 template
 class WinogradTransformOp : public Operator {
  public:
-  WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator(op_def, ws),
-        functor_(static_cast(OperatorBase::GetOptionalArg(
+  WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context)
+      : Operator(op_def, context),
+        functor_(context,
+                 static_cast(OperatorBase::GetOptionalArg(
                      "padding", static_cast(VALID))),
                  OperatorBase::GetRepeatedArgs("padding_values"),
                  OperatorBase::GetOptionalArg(
diff --git a/mace/public/BUILD b/mace/public/BUILD
index 3669d595..b434312b 100644
--- a/mace/public/BUILD
+++ b/mace/public/BUILD
@@ -11,7 +11,6 @@ cc_library(
     name = "public",
     hdrs = [
         "mace.h",
-        "mace_runtime.h",
     ],
     copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"],
 )
diff --git a/mace/public/mace.h b/mace/public/mace.h
index f6116348..0b743423 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -24,12 +24,36 @@
 #include
 #include

+#ifndef MACE_API
+#define MACE_API __attribute__((visibility("default")))
+#endif
+
 namespace mace {

 class NetDef;

 enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 };

+enum GPUPerfHint {
+  PERF_DEFAULT = 0,
+  PERF_LOW = 1,
+  PERF_NORMAL = 2,
+  PERF_HIGH = 3
+};
+
+enum GPUPriorityHint {
+  PRIORITY_DEFAULT = 0,
+  PRIORITY_LOW = 1,
+  PRIORITY_NORMAL = 2,
+  PRIORITY_HIGH = 3
+};
+
+enum CPUAffinityPolicy {
+  AFFINITY_NONE = 0,
+  AFFINITY_BIG_ONLY = 1,
+  AFFINITY_LITTLE_ONLY = 2,
+};
+
 struct CallStats {
   int64_t start_micros;
   int64_t end_micros;
@@ -73,14 +97,167 @@ enum MaceStatus {
     } \
   }

+/// \brief Get ARM big.LITTLE configuration.
+///
+/// This function will detect the max frequencies of all CPU cores, assume
+/// the cores with the largest max frequencies are the big cores, and treat
+/// all the remaining cores as little. If all CPU cores' max frequencies are
+/// equal, big_core_ids and little_core_ids will both be filled with all
+/// CPU core ids.
+///
+/// \param [out] big_core_ids
+/// \param [out] little_core_ids
+/// \return MACE_SUCCESS if successful, and an error if it can't
+/// reliably detect the frequency of big-LITTLE cores (e.g. MTK).
+
+MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
+                                        std::vector<int> *little_core_ids);
+
+/// \brief The GPU context contains the state used by the GPU device.
+
+/// \brief GPUContext contains the state used by the GPU device.
+///
+/// The lifetime of a GPUContext object must cover all the MaceEngines that
+/// use it. Share one GPUContext among all MaceEngines: engines running on
+/// the GPU have some data in common, and sharing one GPUContext avoids
+/// duplicating it, which speeds up the initialization procedure.
+class GPUContext;
+
+/// \brief GPUContext builder.
+///
+/// Use GPUContextBuilder to generate a GPUContext.
+/// Not thread-safe.
+class MACE_API GPUContextBuilder {
+ public:
+  GPUContextBuilder();
+  ~GPUContextBuilder();
+  GPUContextBuilder(const GPUContextBuilder &) = delete;
+  GPUContextBuilder(const GPUContextBuilder &&) = delete;
+  GPUContextBuilder &operator=(const GPUContextBuilder &) = delete;
+  GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete;
+
+  /// \brief Set the internal storage path used to store internal data.
+  ///
+  /// Currently the path is used to store compiled OpenCL binaries on disk,
+  /// which can speed up GPU initialization and the first run.
+  /// If this API is not called, GPU initialization may be slow.
+  ///
+  /// \param path Make sure your program has read/write permission for the
+  /// path.
+  /// \return The builder itself, to allow chaining.
+  GPUContextBuilder &SetStoragePath(const std::string &path);
+
+  /// \brief Set paths of generated OpenCL compiled kernel binary files
+  /// (not libOpenCL.so).
+  ///
+  /// If you run on a GPU of a specific SoC, using an OpenCL binary will
+  /// speed up initialization. The OpenCL binary corresponds to the OpenCL
+  /// driver version, so regenerate the binary whenever the OpenCL driver
+  /// changes.
+  ///
+  /// \param paths MACE will use the first file found among all paths.
+  /// \return The builder itself, to allow chaining.
+  GPUContextBuilder &SetOpenCLBinaryPaths(
+      const std::vector<std::string> &paths);
+
+  /// \brief Set the path of the generated OpenCL parameter file.
+  ///
+  /// If you run on a GPU of a specific SoC, the parameters are the local
+  /// work group sizes tuned for that SoC, which may be faster than the
+  /// general parameters.
+  ///
+  /// \param path Make sure your program has read/write permission for the
+  /// path.
+  /// \return The builder itself, to allow chaining.
+  GPUContextBuilder &SetOpenCLParameterPath(const std::string &path);
+
+  std::shared_ptr<GPUContext> Finalize();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
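+
+/// Example: building a GPUContext to share among engines (a minimal
+/// sketch; the storage path below is hypothetical):
+/// \code
+///   std::shared_ptr<GPUContext> gpu_context =
+///       GPUContextBuilder()
+///           .SetStoragePath("/data/local/tmp/mace_storage")  // hypothetical
+///           .Finalize();
+/// \endcode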
+
+/// \brief Configuration for MaceEngine.
+class MACE_API MaceEngineConfig {
+ public:
+  explicit MaceEngineConfig(const DeviceType device_type);
+  ~MaceEngineConfig();
+  MaceEngineConfig(const MaceEngineConfig &) = delete;
+  MaceEngineConfig(const MaceEngineConfig &&) = delete;
+  MaceEngineConfig &operator=(const MaceEngineConfig &) = delete;
+  MaceEngineConfig &operator=(const MaceEngineConfig &&) = delete;
+
+  /// \brief Set GPUContext.
+  ///
+  /// Use one GPUContext for multiple models running on the GPU.
+  /// \param context the context created with GPUContextBuilder.
+  /// \return MACE_SUCCESS for success, an error code otherwise.
+  MaceStatus SetGPUContext(std::shared_ptr<GPUContext> context);
+
+  /// \brief Set GPU hints; currently only Adreno GPUs are supported.
+  ///
+  /// Caution: this function may hurt performance if improper parameters
+  /// are provided.
+  ///
+  /// \param perf_hint performance hint
+  /// \param priority_hint priority hint
+  /// \return MACE_SUCCESS for success, an error code otherwise.
+  MaceStatus SetGPUHints(GPUPerfHint perf_hint,
+                         GPUPriorityHint priority_hint);
+
+  /// \brief Set CPU threads number and affinity policy.
+  ///
+  /// Caution: this function may hurt performance if improper parameters
+  /// are provided. When num_threads_hint is zero or negative, the threads
+  /// number is set to the number of big (AFFINITY_BIG_ONLY), little
+  /// (AFFINITY_LITTLE_ONLY) or all (AFFINITY_NONE) cores, according to the
+  /// policy. The threads number is also truncated to the corresponding
+  /// cores number when num_threads_hint is larger than it.
+  /// The OpenMP threads will be bound (via sched_setaffinity) to big cores
+  /// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
+  ///
+  /// \param num_threads_hint only a hint.
+  /// \param policy one of CPUAffinityPolicy.
+  /// \param use_gemmlowp use gemmlowp for quantized inference.
+  /// \return MACE_SUCCESS for success, or an error if it can't reliably
+  /// detect big-LITTLE cores (see GetBigLittleCoreIDs); in such cases, it
+  /// is suggested to use AFFINITY_NONE to use all cores.
+  MaceStatus SetCPUThreadPolicy(int num_threads_hint,
+                                CPUAffinityPolicy policy,
+                                bool use_gemmlowp = false);
+
+  /// \brief Set OpenMP threads number and processor affinity.
+  ///
+  /// Caution: this function may hurt performance if improper parameters
+  /// are provided. It may not work well on some chips (e.g. MTK): setting
+  /// thread affinity to offline cores may run very slowly or unexpectedly.
+  /// In such cases, please use SetCPUThreadPolicy with the default policy
+  /// instead.
+  ///
+  /// \param num_threads number of threads to use.
+  /// \param cpu_ids ids of the cores to bind the threads to.
+  /// \return MACE_SUCCESS for success, an error code otherwise.
+  MaceStatus SetOpenMPThreadAffinity(
+      int num_threads,
+      const std::vector<int> &cpu_ids);
+
+  DeviceType device_type() const;
+
+  int num_threads() const;
+
+  std::shared_ptr<GPUContext> gpu_context() const;
+
+  GPUPriorityHint gpu_priority_hint() const;
+
+  GPUPerfHint gpu_perf_hint() const;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
 // MACE input/output tensor
-class __attribute__((visibility("default"))) MaceTensor {
+class MACE_API MaceTensor {
  public:
   // shape - the shape of the tensor, with size n
   // data - the buffer of the tensor, must not be null with size equals
   //        shape[0] * shape[1] * ... * shape[n-1]
-  explicit MaceTensor(const std::vector<int64_t> &shape,
-                      std::shared_ptr<float> data);
+  MaceTensor(const std::vector<int64_t> &shape,
+             std::shared_ptr<float> data);
   MaceTensor();
   MaceTensor(const MaceTensor &other);
   MaceTensor(const MaceTensor &&other);
@@ -97,9 +274,9 @@ class __attribute__((visibility("default"))) MaceTensor {
   std::unique_ptr<Impl> impl_;
 };
 
-class __attribute__((visibility("default"))) MaceEngine {
+class MACE_API MaceEngine {
  public:
-  explicit MaceEngine(DeviceType device_type);
+  explicit MaceEngine(const MaceEngineConfig &config);
   ~MaceEngine();
 
   MaceStatus Init(const NetDef *net_def,
@@ -135,18 +312,16 @@
 /// \param model_data_file[in]: the path of the model data file
 /// \param input_nodes[in]: the array of input nodes' names
 /// \param output_nodes[in]: the array of output nodes' names
-/// \param device_type[in]: one of [CPU, GPU, HEXAGON],
-///        based on the runtime type of your model deployment file.
+/// \param config[in]: configurations for MaceEngine.
 /// \param engine[out]: output MaceEngine object
 /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments,
///         MACE_OUT_OF_RESOURCES when resources run out.
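+///
+/// A minimal end-to-end sketch (illustrative; the model buffer, file path
+/// and node names below are hypothetical):
+/// \code
+///   // model_pb: std::vector<unsigned char> holding the serialized model.
+///   MaceEngineConfig config(DeviceType::GPU);
+///   config.SetGPUContext(gpu_context);  // built with GPUContextBuilder
+///   config.SetCPUThreadPolicy(4, CPUAffinityPolicy::AFFINITY_BIG_ONLY);
+///   std::shared_ptr<MaceEngine> engine;
+///   MaceStatus status = CreateMaceEngineFromProto(
+///       model_pb, "/path/to/model.data",
+///       {"input_node"}, {"output_node"}, config, &engine);
+/// \endcode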
-__attribute__((visibility("default"))) -MaceStatus CreateMaceEngineFromProto( +MACE_API MaceStatus CreateMaceEngineFromProto( const std::vector &model_pb, const std::string &model_data_file, const std::vector &input_nodes, const std::vector &output_nodes, - const DeviceType device_type, + const MaceEngineConfig &config, std::shared_ptr *engine); } // namespace mace diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h deleted file mode 100644 index 4cd60d2b..00000000 --- a/mace/public/mace_runtime.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file defines runtime tuning APIs. -// These APIs are not stable. - -#ifndef MACE_PUBLIC_MACE_RUNTIME_H_ -#define MACE_PUBLIC_MACE_RUNTIME_H_ - -#include -#include -#include -#include - -#include "mace/public/mace.h" - -namespace mace { - -enum GPUPerfHint { - PERF_DEFAULT = 0, - PERF_LOW = 1, - PERF_NORMAL = 2, - PERF_HIGH = 3 -}; - -enum GPUPriorityHint { - PRIORITY_DEFAULT = 0, - PRIORITY_LOW = 1, - PRIORITY_NORMAL = 2, - PRIORITY_HIGH = 3 -}; - -enum CPUAffinityPolicy { - AFFINITY_NONE = 0, - AFFINITY_BIG_ONLY = 1, - AFFINITY_LITTLE_ONLY = 2, -}; - -class KVStorage { - public: - // return: 0 for success, -1 for error - virtual int Load() = 0; - virtual void Clear() = 0; - virtual bool Insert(const std::string &key, - const std::vector &value) = 0; - virtual const std::vector *Find(const std::string &key) = 0; - // return: 0 for success, -1 for error - virtual int Flush() = 0; - virtual ~KVStorage() {} -}; - -class KVStorageFactory { - public: - virtual std::unique_ptr CreateStorage(const std::string &name) = 0; -}; - -class __attribute__((visibility("default"))) FileStorageFactory - : public KVStorageFactory { - public: - // You have to make sure your APP have read and write permission of the path. - explicit FileStorageFactory(const std::string &path); - - ~FileStorageFactory(); - - std::unique_ptr CreateStorage(const std::string &name) override; - - private: - class Impl; - std::unique_ptr impl_; -}; - -/// \brief Set internal storage factory to store internal data. (Call once) -/// -/// Now the path is used to store the built OpenCL binaries to file, -/// which could speed up the GPU initialization and first run. -/// If do not call this API, the initialization maybe slow for GPU. -/// -/// \param path Make sure your program have Read/Write permission of the path -/// \return -__attribute__((visibility("default"))) -void SetKVStorageFactory(std::shared_ptr storage_factory); - -/// \brief Set paths of Generated OpenCL Compiled Kernel Binary file (not libOpenCL.so) // NOLINT(whitespace/line_length) -/// -/// Just call once. (Not thread-safe) -/// if you use gpu of specific soc, Using OpenCL binary will speed up the initialization. // NOLINT(whitespace/line_length) -/// OpenCL binary is corresponding to the OpenCL Driver version, -/// you should update the binary when OpenCL Driver changed. 
-/// -/// \param paths MACE will use first file found in all paths -/// \return -__attribute__((visibility("default"))) -void SetOpenCLBinaryPaths(const std::vector &paths); - -/// \brief Set the path of Generated OpenCL parameter file -/// -/// Just call once. (Not thread-safe) -/// If you use gpu for specific soc, The parameters is the local work group -/// size tuned for specific SOC, which may be faster than the -/// general parameters. -/// -/// \param path Make sure your program have Read/Write permission of the path -/// \return -__attribute__((visibility("default"))) -void SetOpenCLParameterPath(const std::string &path); - -/// \brief Set GPU hints, currently only supports Adreno GPU. -/// -/// Caution: this function may hurt performance -/// if improper parameters provided. -/// -/// \param perf_hint performance hint -/// \param priority_hint priority hint -/// \return -__attribute__((visibility("default"))) -void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); - -/// \brief Set OpenMP threads number and affinity policy. -/// -/// Caution: this function may hurt performance if improper parameters provided. -/// When num_threads_hint is zero or negative, -/// the function will set the threads number equaling to the number of -/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all -/// (AFFINITY_NONE) cores according to the policy. The threads number will -/// also be truncated to the corresponding cores number when num_threads_hint -/// is larger than it. -/// The OpenMP threads will be bind to (via sched_setaffinity) big cores -/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY). -/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for -/// quantized inference. -/// -/// \param num_threads_hint it is only a hint. -/// \param policy one of CPUAffinityPolicy -/// \param use_gemmlowp use gemmlowp for quantized inference -/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE -/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use -/// AFFINITY_NONE to use all cores. -__attribute__((visibility("default"))) -MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy, - bool use_gemmlowp = false); - -/// \brief Set OpenMP threads number and processor affinity. -/// -/// Caution: this function may hurt performance -/// if improper parameters provided. -/// This function may not work well on some chips (e.g. MTK). Setting thread -/// affinity to offline cores may run very slow or unexpectedly. -/// In such cases, please use SetOpenMPThreadPolicy with default policy -/// instead. -/// -/// \param num_threads -/// \param cpu_ids -/// \return -__attribute__((visibility("default"))) -MaceStatus SetOpenMPThreadAffinity(int num_threads, - const std::vector &cpu_ids); - -/// \brief Get ARM big.LITTLE configuration. -/// -/// This function will detect the max frequencies of all CPU cores, and assume -/// the cores with largest max frequencies as big cores, and all the remaining -/// cores as little. If all cpu core's max frequencies equals, big_core_ids and -/// little_core_ids will both be filled with all cpu core ids. -/// -/// \param [out] big_core_ids -/// \param [out] little_core_ids -/// \return If successful, it returns MACE_SUCCESS and error if it can't -/// reliabley detect the frequency of big-LITTLE cores (e.g. MTK). 
-__attribute__((visibility("default"))) -MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, - std::vector *little_core_ids); -} // namespace mace - -#endif // MACE_PUBLIC_MACE_RUNTIME_H_ diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2 index 47287936..2bdda143 100644 --- a/mace/python/tools/mace_engine_factory.h.jinja2 +++ b/mace/python/tools/mace_engine_factory.h.jinja2 @@ -20,7 +20,6 @@ #include #include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" namespace mace { @@ -57,8 +56,7 @@ std::map model_name_map { /// if model_data_format is code, just pass empty string("") /// \param input_nodes[in]: the array of input nodes' name /// \param output_nodes[in]: the array of output nodes' name -/// \param device_type[in]: one of [CPU, GPU, HEXAGON], -/// based on the runtime type of your model deployment file. +/// \param config[in]: configurations for MaceEngine. /// \param engine[out]: output MaceEngine object /// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// MACE_OUT_OF_RESOURCES for resources is out of range. @@ -67,7 +65,7 @@ MaceStatus CreateMaceEngineFromCode( const std::string &model_data_file, const std::vector &input_nodes, const std::vector &output_nodes, - const DeviceType device_type, + const MaceEngineConfig &config, std::shared_ptr *engine) { // load model if (engine == nullptr) { @@ -83,7 +81,7 @@ MaceStatus CreateMaceEngineFromCode( {% for i in range(model_tags |length) %} case {{ i }}: net_def = mace::{{model_tags[i]}}::CreateNet(); - engine->reset(new mace::MaceEngine(device_type)); + engine->reset(new mace::MaceEngine(config)); {% if embed_model_data %} model_data = mace::{{model_tags[i]}}::LoadModelData(); status = (*engine)->Init(net_def.get(), input_nodes, output_nodes, diff --git a/mace/test/BUILD b/mace/test/BUILD index 09c9e030..04253cda 100644 --- a/mace/test/BUILD +++ b/mace/test/BUILD @@ -1,6 +1,3 @@ -# Description: -# Mace operators. 
-# package( default_visibility = ["//visibility:public"], ) diff --git a/mace/test/mace_api_exception_test.cc b/mace/test/mace_api_exception_test.cc index 1eaad037..7507ffc8 100644 --- a/mace/test/mace_api_exception_test.cc +++ b/mace/test/mace_api_exception_test.cc @@ -23,7 +23,9 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { input_names.push_back(MakeString("input", 0)); output_names.push_back(MakeString("output", 0)); - const DeviceType device = DeviceType::GPU; + MaceEngineConfig config(DeviceType::GPU); + config.SetGPUContext( + ops::test::OpTestContext::Get()->gpu_context()); std::shared_ptr net_def(new NetDef()); for (size_t i = 0; i < input_names.size(); ++i) { @@ -31,7 +33,7 @@ TEST(MaceAPIExceptionTest, WrongInputTest) { info->set_name(input_names[i]); } - MaceEngine engine(device); + MaceEngine engine(config); ASSERT_DEATH(engine.Init(net_def.get(), {"input"}, output_names, nullptr), ""); } diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index e2a09fec..6d554bbe 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -18,7 +18,6 @@ #include "mace/core/operator.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" -#include "mace/public/mace_runtime.h" namespace mace { namespace test { @@ -200,7 +199,7 @@ void CheckOutputs(const NetDef &net_def, for (auto output : outputs) { std::unique_ptr tmp_tensor( - new Tensor(GetDeviceAllocator(DeviceType::CPU), + new Tensor(GetCPUAllocator(), DataTypeToEnum::v())); auto output_shape = output.second.shape(); const int64_t data_size = std::accumulate(output_shape.begin(), @@ -333,13 +332,9 @@ void MaceRunFunc(const int in_out_size) { OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + MaceEngineConfig config(DeviceType::GPU); - const std::string file_path ="/data/local/tmp/mace"; - std::shared_ptr storage_factory( - new FileStorageFactory(file_path)); - mace::SetKVStorageFactory(storage_factory); - - MaceEngine engine(device); + MaceEngine engine(config); MaceStatus status = engine.Init(net_def.get(), input_names, output_names, reinterpret_cast(data.data())); EXPECT_EQ(status, MaceStatus::MACE_SUCCESS); @@ -367,7 +362,7 @@ TEST_F(MaceMTAPITest, MultipleThread) { const int thread_num = 10; std::vector threads; for (int i = 0; i < thread_num; ++i) { - threads.push_back(std::thread(MaceRunFunc, i)); + threads.push_back(std::thread(MaceRunFunc, 1)); } for (auto &t : threads) { t.join(); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 6b1f353e..83d3b33d 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -18,7 +18,7 @@ #include "mace/core/operator.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" -#include "mace/public/mace_runtime.h" +#include "mace/public/mace.h" namespace mace { namespace test { @@ -199,9 +199,10 @@ void CheckOutputs(const NetDef &net_def, } net.RunNet(net_def, D); + std::unique_ptr allocator(new CPUAllocator); for (auto output : outputs) { std::unique_ptr tmp_tensor( - new Tensor(GetDeviceAllocator(DeviceType::CPU), + new Tensor(allocator.get(), DataTypeToEnum::v())); auto output_shape = output.second.shape(); const int64_t data_size = std::accumulate(output_shape.begin(), @@ -333,7 +334,9 @@ void MaceRun(const int in_out_size, info->set_name(output_names[i]); } - MaceEngine engine(device); + MaceEngineConfig config(DeviceType::GPU); + + MaceEngine engine(config); MaceStatus status = engine.Init(net_def.get(), input_names, 
output_names, reinterpret_cast(data.data())); EXPECT_EQ(status, MaceStatus::MACE_SUCCESS); diff --git a/mace/tools/quantization/quantize_stat.cc b/mace/tools/quantization/quantize_stat.cc index a05f42f7..936196e3 100644 --- a/mace/tools/quantization/quantize_stat.cc +++ b/mace/tools/quantization/quantize_stat.cc @@ -33,7 +33,6 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" #include "mace/utils/env_time.h" #include "mace/utils/logging.h" #include "mace/utils/utils.h" @@ -122,8 +121,15 @@ bool RunModel(const std::string &model_name, const std::vector> &input_shapes, const std::vector &output_names, const std::vector> &output_shapes) { - MACE_RETURN_IF_ERROR(mace::SetOpenMPThreadPolicy( - FLAGS_omp_num_threads, CPUAffinityPolicy::AFFINITY_NONE)); + // config runtime + MaceStatus status; + MaceEngineConfig config(DeviceType::CPU); + status = config.SetCPUThreadPolicy( + FLAGS_omp_num_threads, + CPUAffinityPolicy::AFFINITY_NONE); + if (status != MACE_SUCCESS) { + LOG(WARNING) << "Set openmp or cpu affinity failed."; + } std::vector model_pb_data; if (FLAGS_model_file != "") { @@ -141,7 +147,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - DeviceType::CPU, + config, &engine)); #else (void) (model_name); @@ -150,7 +156,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - DeviceType::CPU, + config, &engine)); #endif diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 0aeefb78..3873e5dd 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -33,7 +33,6 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" -#include "mace/public/mace_runtime.h" #include "mace/utils/env_time.h" #include "mace/utils/logging.h" #include "mace/utils/utils.h" @@ -203,35 +202,37 @@ bool RunModel(const std::string &model_name, const std::vector> &output_shapes) { DeviceType device_type = ParseDeviceType(FLAGS_device); // config runtime - MaceStatus status = mace::SetOpenMPThreadPolicy( - FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy), - true); + MaceStatus status; + MaceEngineConfig config(device_type); + status = config.SetCPUThreadPolicy( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_affinity_policy), + true); if (status != MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } #ifdef MACE_ENABLE_OPENCL + std::shared_ptr gpu_context; if (device_type == DeviceType::GPU) { - mace::SetGPUHints( - static_cast(FLAGS_gpu_perf_hint), - static_cast(FLAGS_gpu_priority_hint)); - + const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH"); + const std::string storage_path = + std::string(storage_path_ptr == nullptr ? + "/data/local/tmp/mace_run/interior" : storage_path_ptr); std::vector opencl_binary_paths = {FLAGS_opencl_binary_file}; - mace::SetOpenCLBinaryPaths(opencl_binary_paths); - mace::SetOpenCLParameterPath(FLAGS_opencl_parameter_file); + gpu_context = GPUContextBuilder() + .SetStoragePath(storage_path) + .SetOpenCLBinaryPaths(opencl_binary_paths) + .SetOpenCLParameterPath(FLAGS_opencl_parameter_file) + .Finalize(); + + config.SetGPUContext(gpu_context); + config.SetGPUHints( + static_cast(FLAGS_gpu_perf_hint), + static_cast(FLAGS_gpu_priority_hint)); } #endif // MACE_ENABLE_OPENCL - const char *kernel_path = getenv("MACE_INTERNAL_STORAGE_PATH"); - const std::string kernel_file_path = - std::string(kernel_path == nullptr ? 
- "/data/local/tmp/mace_run/interior" : kernel_path); - - std::shared_ptr storage_factory( - new FileStorageFactory(kernel_file_path)); - SetKVStorageFactory(storage_factory); - std::vector model_pb_data; if (FLAGS_model_file != "") { if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) { @@ -252,7 +253,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #else (void)(model_name); @@ -261,7 +262,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #endif int64_t t1 = NowMicros(); @@ -326,7 +327,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #else create_engine_status = @@ -334,7 +335,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #endif } while (create_engine_status != MACE_SUCCESS); @@ -366,7 +367,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #else create_engine_status = @@ -374,7 +375,7 @@ bool RunModel(const std::string &model_name, FLAGS_model_data_file, input_names, output_names, - device_type, + config, &engine); #endif } while (create_engine_status != MACE_SUCCESS); diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index e4007b66..3295ddae 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -15,6 +15,8 @@ #ifndef MACE_UTILS_TUNER_H_ #define MACE_UTILS_TUNER_H_ #include + +#include #include #include #include @@ -29,18 +31,24 @@ namespace mace { +inline bool IsTuning() { + const char *tuning = getenv("MACE_TUNING"); + return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1'; +} + template class Tuner { public: - static Tuner *Get() { - static Tuner tuner; - return &tuner; + explicit Tuner(const std::string tuned_param_file_path = ""): + tuned_param_file_path_(tuned_param_file_path) { + path_ = getenv("MACE_RUN_PARAMETER_PATH"); + ReadRunParamters(); } - inline bool IsTuning() { - const char *tuning = getenv("MACE_TUNING"); - return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1'; - } + ~Tuner() { WriteRunParameters(); } + + Tuner(const Tuner &) = delete; + Tuner &operator=(const Tuner &) = delete; template RetType TuneOrRun( @@ -76,16 +84,6 @@ class Tuner { } private: - Tuner() { - path_ = getenv("MACE_RUN_PARAMETER_PATH"); - ReadRunParamters(); - } - - ~Tuner() { WriteRunParameters(); } - - Tuner(const Tuner &) = delete; - Tuner &operator=(const Tuner &) = delete; - inline void WriteRunParameters() { if (path_ != nullptr) { VLOG(3) << "Write tuning result to " << path_; @@ -117,9 +115,9 @@ class Tuner { } inline void ReadRunParamters() { - extern std::string kOpenCLParameterPath; - if (!kOpenCLParameterPath.empty()) { - std::ifstream ifs(kOpenCLParameterPath, std::ios::binary | std::ios::in); + if (!tuned_param_file_path_.empty()) { + std::ifstream ifs(tuned_param_file_path_, + std::ios::binary | std::ios::in); if (ifs.is_open()) { int64_t num_params = 0; ifs.read(reinterpret_cast(&num_params), sizeof(num_params)); @@ -144,7 +142,7 @@ class Tuner { LOG(WARNING) << "Read OpenCL tuned parameters file failed."; } } else { - LOG(INFO) << "There is no tuned parameters."; + VLOG(1) << "There is no tuned parameters."; } } @@ -207,6 +205,7 @@ class Tuner { } private: + std::string tuned_param_file_path_; const char *path_; 
std::unordered_map> param_table_; }; diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc index bd590ac9..bff02b0b 100644 --- a/mace/utils/tuner_test.cc +++ b/mace/utils/tuner_test.cc @@ -42,15 +42,16 @@ TEST_F(TunerTest, SimpleRun) { } }; + Tuner tuner; WallClockTimer timer; std::vector default_params(1, 1); - int res = Tuner::Get()->template TuneOrRun( + int res = tuner.TuneOrRun( "SimpleRun", default_params, nullptr, TunerFunc, &timer); EXPECT_EQ(expect, res); default_params[0] = 2; - res = Tuner::Get()->template TuneOrRun( + res = tuner.TuneOrRun( "SimpleRun", default_params, nullptr, TunerFunc, &timer); EXPECT_EQ(expect + 1, res); } @@ -88,13 +89,14 @@ TEST_F(TunerTest, SimpleTune) { return {{1}, {2}, {3}, {4}}; }; // tune + Tuner tuner; WallClockTimer timer; - int res = Tuner::Get()->template TuneOrRun( + int res = tuner.TuneOrRun( "SimpleRun", default_params, *params_generator, TunerFunc, &timer); EXPECT_EQ(expect, res); // run - res = Tuner::Get()->template TuneOrRun( + res = tuner.template TuneOrRun( "SimpleRun", default_params, nullptr, TunerFunc, &timer); EXPECT_EQ(expect, res); } -- GitLab