diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md index 1b1910db81c7902bf56f91199d672dc5ec00e3d1..33a1a60d736f5431b1672e68609aff524b4e57be 100644 --- a/docs/development/adding_a_new_op.md +++ b/docs/development/adding_a_new_op.md @@ -5,107 +5,114 @@ You can create a custom op if it is not supported yet. To add a custom op, you need to follow these steps: -Define the Op class -------------------- -Define the new Op class in `mace/ops/my_custom_op.h`. - +Register the new OpDef information +---------------------------------- +Register the OpDef information about which devices the operation could run on. +The registry file is in `mace/ops/ops_def_register.cc`. ```c++ -#ifndef MACE_OPS_MY_CUSTOM_OP_H_ -#define MACE_OPS_MY_CUSTOM_OP_H_ +#include "mace/ops/ops_def_register.h" + +namespace mace { +namespace ops { + +void RegisterOpDefs(OpDefRegistryBase *op_def_registry) { + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("MyCustomOp") + .SetDevicePlaceFunc([]() -> std::vector<DeviceType> { + return {DeviceType::CPU, DeviceType::GPU}; + })); + ...... +} +} // namespace ops +} // namespace mace + +``` +Implement the Operation +----------------------- +The best way is to refer to the implementation of other operators (e.g. `mace/kernels/activation.cc`). + +Define the new Op class in `mace/kernels/my_custom_op.cc`. +1. CPU code: just write the code in `mace/kernels/my_custom_op.cc`. +2. GPU code: Kernel API is defined in `mace/kernels/my_custom_op.h`, +Kernel based on Image is realized in `mace/kernels/opencl/image/my_custom_op.cc`, +Kernel based on Buffer is realized in `mace/kernels/opencl/buffer/my_custom_op.cc`. + +The structure is like the following code. 
+```c++ #include "mace/core/operator.h" -#include "mace/kernels/my_custom_op.h" namespace mace { -namespace ops { +namespace kernels { + +template <DeviceType D, class T> +class MyCustomOp; -template -class MyCustomOp : public Operator { - public: - MyCustomOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_() {} - - bool Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - functor_(input, output, future); - return true; - } - - protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::MyCustomOpFunctor functor_; +template <> +class MyCustomOp<DeviceType::CPU, float> : public Operation { +... +}; + +#ifdef MACE_ENABLE_OPENCL +template <typename T> +class MyCustomOp<DeviceType::GPU, T> : public Operation { +... }; +#endif // MACE_ENABLE_OPENCL } // namespace ops } // namespace mace -#endif // MACE_OPS_MY_CUSTOM_OP_H_ - ``` -Register the new Op -------------------- -Define the Ops registering function in `mace/ops/my_custom_op.cc`. +Register the Operation +----------------------- +1. Add the register function in `mace/kernels/my_custom_op.cc`. ```c++ -#include "mace/ops/my_custom_op.h" +#include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { -void Register_My_Custom_Op(OperatorRegistryBase *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Custom_Op); - - REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - Custom_Op); - - REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - Custom_Op); -} +void RegisterMyCustomOp(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, + DeviceType::CPU, float); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, + DeviceType::GPU, float); + + 
MACE_REGISTER_OP(op_registry, "MyCustomOp", MyCustomOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} } // namespace ops } // namespace mace - ``` -And then register the new Op in `mace/ops/ops_register.cc`. +2. And then register the new Op in `mace/kernels/ops_register.cc`. ``` -#include "mace/ops/ops_register.h" +#include "mace/kernels/ops_register.h" namespace mace { - namespace ops { // Keep in lexicographical order ... -extern void Register_My_Custom_Op(OperatorRegistryBase *op_registry); +extern void RegisterMyCustomOp(OpRegistryBase *op_registry); ... } // namespace ops -OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { +OpRegistry::OpRegistry() : OpRegistryBase() { // Keep in lexicographical order ... - ops::Register_My_Custom_Op(this); + ops::RegisterMyCustomOp(this); ... @@ -113,16 +120,13 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { } // namespace mace ``` +Add UTs +---------------------- +Add operation unit tests in `mace/ops/my_custom_op_test.cc`. -Implement the Op kernel code ---------------------------- -You need to implement the CPU kernel in a `mace/kernels/my_custom_op.h` and -optionally OpenCL kernel in `mace/kernels/kernels/my_custom_op_opencl.cc` and -`mace/kernels/kernels/cl/my_custom_op.cl`. You can also optimize the CPU -kernel with NEON. - -Add test and benchmark +Add benchmark ---------------------- +Add operation benchmark in `mace/ops/my_custom_op_benchmark.cc`. It's strongly recommended to add unit tests and micro benchmarks for your new Op. If you wish to contribute back, it's required. 
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 26fb2d0b0e4e17355efc9122c00a25147ffe00ba..7f0afe2405c2bd6f07545a34f7b5deaa17ebd145 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -263,7 +263,7 @@ int Main(int argc, char **argv) { FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy), true); - if (mace_status != MACE_SUCCESS) { + if (mace_status != MaceStatus::MACE_SUCCESS) { LOG(INFO) << "Set openmp or cpu affinity failed."; } #ifdef MACE_ENABLE_OPENCL diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD index 8a24594c15662dcb04d1c10772000acc0488f835..5122da1dac77fc5d28ba92930267a6d87b6eccad 100644 --- a/mace/codegen/BUILD +++ b/mace/codegen/BUILD @@ -50,6 +50,7 @@ cc_library( copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], deps = [ "//mace/public", + "//mace/utils", ], ) diff --git a/mace/core/future.h b/mace/core/future.h index f5807f54daabc9c1bba6e4ed29b1d5cfb8e0861b..6fb82d98e706b60a0f1ec8ea7ac633a77fcafc7d 100644 --- a/mace/core/future.h +++ b/mace/core/future.h @@ -27,7 +27,12 @@ struct CallStats; // Wait the call to finish and get the stats if param is not nullptr struct StatsFuture { - std::function wait_fn; + std::function wait_fn = [](CallStats *stats) { + if (stats != nullptr) { + stats->start_micros = NowMicros(); + stats->end_micros = stats->start_micros; + } + }; }; inline void SetFutureDefaultWaitFn(StatsFuture *future) { diff --git a/mace/core/net.cc b/mace/core/net.cc index d71a14826a5ae8b907a56526ba79c6bce245e12d..757b4831120899ebea17699f0cc22403713a93f4 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -16,8 +16,10 @@ #include #include +#include "mace/core/future.h" #include "mace/core/macros.h" #include "mace/core/net.h" +#include "mace/core/op_context.h" #include "mace/public/mace.h" #include "mace/utils/memory_logging.h" #include "mace/utils/timer.h" @@ -25,39 +27,60 @@ namespace mace { -NetBase::NetBase(const std::shared_ptr 
op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device) - : op_registry_(op_registry) { - MACE_UNUSED(net_def); - MACE_UNUSED(ws); - MACE_UNUSED(device); -} - -SerialNet::SerialNet( - const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device, - const NetMode mode) - : NetBase(op_registry, net_def, ws, device), device_(device), - op_kernel_context_(new OpKernelContext(ws, device)) { +SerialNet::SerialNet(OpDefRegistryBase *op_def_registry, + const OpRegistryBase *op_registry, + const NetDef *net_def, + Workspace *ws, + Device *target_device, + const NetMode mode) + : NetBase(), + ws_(ws), + target_device_(target_device), + cpu_device_( + new CPUDevice(target_device->cpu_runtime()->num_threads(), + target_device->cpu_runtime()->policy(), + target_device->cpu_runtime()->use_gemmlowp())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - DeviceType device_type = device->device_type(); + // Register Operations + MaceStatus status; + for (int idx = 0; idx < net_def->op_types_size(); ++idx) { + status = op_def_registry->Register(net_def->op_types(idx)); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information()); + } + // Create Operations + operators_.clear(); + const OpRegistrationInfo *info; + DeviceType target_device_type = target_device_->device_type(); + OpConstructContext construct_context(ws_); for (int idx = 0; idx < net_def->op_size(); ++idx) { const auto &operator_def = net_def->op(idx); - // TODO(liuqi): refactor to add device_type to OperatorDef + // Create the Operation const int op_device = ProtoArgHelper::GetOptionalArg( - operator_def, "device", static_cast(device_type)); - if (op_device == device_type) { - VLOG(3) << "Creating operator " << operator_def.name() << "(" - << operator_def.type() << ")"; + operator_def, "device", static_cast(target_device_type)); + if (op_device == target_device_type) { + // Find op registration information + status = 
op_def_registry->Find(operator_def.type(), &info); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information()); + // Get available devices (sorted based on priority) OperatorDef temp_def(operator_def); - std::unique_ptr op( - op_registry->CreateOperator(temp_def, op_kernel_context_.get(), - device_type, mode)); + auto available_devices = info->device_place_func_(); + // Find the device type to run the op. + // If the target_device_type in available devices, use target_device_type, + // otherwise, fallback to the first device (top priority). + DeviceType device_type = available_devices[0]; + construct_context.set_device(cpu_device_); + for (auto device : available_devices) { + if (device == target_device_type) { + device_type = target_device_type; + construct_context.set_device(target_device_); + break; + } + } + temp_def.set_device_type(device_type); + construct_context.set_operator_def(&temp_def); + std::unique_ptr op( + op_registry->CreateOperation(&construct_context, device_type, mode)); if (op) { operators_.emplace_back(std::move(op)); } @@ -65,38 +88,59 @@ SerialNet::SerialNet( } } +MaceStatus SerialNet::Init() { + // TODO(liuqi): where to do memory reuse. 
+ MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); + OpInitContext init_context(ws_); + for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { + auto &op = *iter; + DeviceType device_type = op->device_type(); + if (device_type == target_device_->device_type()) { + init_context.set_device(target_device_); + } else { + init_context.set_device(cpu_device_); + } + // Initialize the operation + MACE_RETURN_IF_ERROR(op->Init(&init_context)); + } + return MaceStatus::MACE_SUCCESS; +} + MaceStatus SerialNet::Run(RunMetadata *run_metadata) { + // TODO(liuqi): In/Out Buffer Transform MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); - const DeviceType device_type = device_->device_type(); + OpContext context(ws_, cpu_device_); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; - MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", - op->debug_def().type(), "), mem_id: ", + DeviceType device_type = op->device_type(); + MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), + "<", device_type, ", ", op->debug_def().type(), ">", + ". 
mem_id: ", MakeListString(op->debug_def().mem_id().data(), op->debug_def().mem_id().size())); - bool future_wait = (device_type == DeviceType::GPU && - (run_metadata != nullptr || - std::distance(iter, operators_.end()) == 1)); + if (device_type == target_device_->device_type()) { + context.set_device(target_device_); + } else { + context.set_device(cpu_device_); + } CallStats call_stats; - if (future_wait) { - StatsFuture future; - MACE_RETURN_IF_ERROR(op->Run(&future)); - if (run_metadata != nullptr) { + if (run_metadata == nullptr) { + MACE_RETURN_IF_ERROR(op->Run(&context)); + } else { + if (device_type == DeviceType::CPU) { + call_stats.start_micros = NowMicros(); + MACE_RETURN_IF_ERROR(op->Run(&context)); + call_stats.end_micros = NowMicros(); + } else if (device_type == DeviceType::GPU) { + StatsFuture future; + context.set_future(&future); + MACE_RETURN_IF_ERROR(op->Run(&context)); future.wait_fn(&call_stats); - } else { - future.wait_fn(nullptr); } - } else if (run_metadata != nullptr) { - call_stats.start_micros = NowMicros(); - MACE_RETURN_IF_ERROR(op->Run(nullptr)); - call_stats.end_micros = NowMicros(); - } else { - MACE_RETURN_IF_ERROR(op->Run(nullptr)); - } - if (run_metadata != nullptr) { + // Record run metadata std::vector strides; int padding_type = -1; std::vector paddings; @@ -150,19 +194,20 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { << "@@" << min_v << "," << max_v; } } else { + const int bin_size = 2048; for (int ind = 0; ind < op->debug_def().quantize_info_size(); ++ind) { float min_v = op->debug_def().quantize_info(ind).minval(); float max_v = op->debug_def().quantize_info(ind).maxval(); - std::vector bin_distribution(kBinSize, 0); - float bin_v = (max_v - min_v) / kBinSize; + std::vector bin_distribution(bin_size, 0); + float bin_v = (max_v - min_v) / bin_size; Tensor::MappingGuard guard(op->Output(i)); const float *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { int ind = 
static_cast((output_data[j] - min_v) / bin_v); if (ind < 0) ind = 0; - else if (ind > kBinSize-1) - ind = kBinSize-1; + else if (ind > bin_size-1) + ind = bin_size-1; bin_distribution[ind]++; } LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) @@ -174,28 +219,6 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const NetDef &net_def, - Workspace *ws, - Device *device, - const NetMode mode) { - std::shared_ptr tmp_net_def(new NetDef(net_def)); - return CreateNet(op_registry, tmp_net_def, ws, device, mode); -} - -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device, - const NetMode mode) { - std::unique_ptr net( - new SerialNet(op_registry, net_def, ws, device, mode)); - return net; -} - } // namespace mace diff --git a/mace/core/net.h b/mace/core/net.h index ecff907b739564f81e55b5f2bee0111cc52b3146..799e07d4e6474b7e857870d40e4caa6f398ac0b1 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -21,64 +21,51 @@ #include #include -#include "mace/core/operator.h" -#include "mace/utils/string_util.h" +#include "mace/core/op_def_registry.h" -#define kBinSize 2048 +#include "mace/core/operator.h" namespace mace { class RunMetadata; -class OperatorBase; class Workspace; class NetBase { public: - NetBase(const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device); - virtual ~NetBase() noexcept {} + NetBase() noexcept = default; + virtual ~NetBase() = default; + + virtual MaceStatus Init() = 0; virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0; protected: - const std::shared_ptr op_registry_; - MACE_DISABLE_COPY_AND_ASSIGN(NetBase); }; class SerialNet : public NetBase { public: - SerialNet(const std::shared_ptr op_registry, - const std::shared_ptr net_def, + SerialNet(OpDefRegistryBase 
*op_def_registry, + const OpRegistryBase *op_registry, + const NetDef *net_def, Workspace *ws, - Device *device, + Device *target_device, const NetMode mode = NetMode::NORMAL); + MaceStatus Init() override; + MaceStatus Run(RunMetadata *run_metadata = nullptr) override; protected: - std::vector > operators_; - Device *device_; - std::unique_ptr op_kernel_context_; + Workspace *ws_; + Device *target_device_; + // CPU is base device. + Device *cpu_device_; + std::vector > operators_; MACE_DISABLE_COPY_AND_ASSIGN(SerialNet); }; -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const NetDef &net_def, - Workspace *ws, - Device *device, - const NetMode mode = NetMode::NORMAL); -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device, - const NetMode mode = NetMode::NORMAL); - } // namespace mace #endif // MACE_CORE_NET_H_ diff --git a/mace/core/op_kernel_context.cc b/mace/core/op_context.cc similarity index 61% rename from mace/core/op_kernel_context.cc rename to mace/core/op_context.cc index 20f9e561a43ea58179818fcf03989020bf6692a5..a26b5e229f61f7fa5eba73301d3b6ae813f9b0ae 100644 --- a/mace/core/op_kernel_context.cc +++ b/mace/core/op_context.cc @@ -12,21 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" namespace mace { -OpKernelContext::OpKernelContext(Workspace *ws, Device *device) - : device_(device), ws_(ws) {} +OpContext::OpContext(Workspace *ws, Device *device) + : device_(device), ws_(ws), future_(nullptr) {} -OpKernelContext::~OpKernelContext() = default; +OpContext::~OpContext() = default; -Device* OpKernelContext::device() { +void OpContext::set_device(Device *device) { + device_ = device; +} + +Device* OpContext::device() { return device_; } -Workspace* OpKernelContext::workspace() { +Workspace* OpContext::workspace() { return ws_; } +void OpContext::set_future(StatsFuture *future) { + future_ = future; +} + +StatsFuture *OpContext::future() { + return future_; +} + } // namespace mace diff --git a/mace/core/op_kernel_context.h b/mace/core/op_context.h similarity index 70% rename from mace/core/op_kernel_context.h rename to mace/core/op_context.h index fe5e777cd5b5647ccb42f684fb1363224740333e..6772b14fca9bdd7dc4c623a614e386abd345cdde 100644 --- a/mace/core/op_kernel_context.h +++ b/mace/core/op_context.h @@ -12,23 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_ -#define MACE_CORE_OP_KERNEL_CONTEXT_H_ +#ifndef MACE_CORE_OP_CONTEXT_H_ +#define MACE_CORE_OP_CONTEXT_H_ #include "mace/core/device.h" #include "mace/core/workspace.h" +#include "mace/core/future.h" + namespace mace { -class OpKernelContext { +class OpContext { public: - OpKernelContext(Workspace *ws, Device *device); - ~OpKernelContext(); + OpContext(Workspace *ws, Device *device); + ~OpContext(); + void set_device(Device *device); Device *device(); Workspace *workspace(); + + void set_future(StatsFuture *future); + StatsFuture *future(); private: Device *device_; Workspace *ws_; + StatsFuture *future_; + // metadata }; } // namespace mace -#endif // MACE_CORE_OP_KERNEL_CONTEXT_H_ +#endif // MACE_CORE_OP_CONTEXT_H_ diff --git a/mace/core/op_def_registry.cc b/mace/core/op_def_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bb8de9e7aecd29a6dd684cea2fca6745223fbaf --- /dev/null +++ b/mace/core/op_def_registry.cc @@ -0,0 +1,77 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/op_def_registry.h" +#include "mace/utils/logging.h" + +namespace mace { + +void AddOpRegistrar(OpDefRegistryBase *registry, + const OpRegistrationBuilder &builder) { + registry->AddRegistrar( + builder.name(), + [builder](OpRegistrationInfo *info){ + builder.Finalize(info); + }); +} + +OpRegistrationBuilder::OpRegistrationBuilder(const std::string name) + : name_(name) {} + +const std::string OpRegistrationBuilder::name() const { return name_; } + +OpRegistrationBuilder &OpRegistrationBuilder::SetDevicePlaceFunc( + std::vector (*func)()) { + info_.device_place_func_ = func; + return *this; +} + +void OpRegistrationBuilder::Finalize(OpRegistrationInfo *info) const { + *info = info_; +} + +void OpDefRegistryBase::AddRegistrar(const std::string name, + const OpRegistrar ®istrar) { + registrar_.emplace(name, registrar); +} + +MaceStatus OpDefRegistryBase::Register(const std::string &name) { + VLOG(3) << "Registering operation definition: " << name; + if (registry_.find(name) != registry_.end()) { + return MaceStatus::MACE_SUCCESS; + } + auto iter = registrar_.find(name); + if (iter == registrar_.end()) { + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "MACE do not support the operation: " + name); + } + registry_.emplace( + name, std::unique_ptr(new OpRegistrationInfo())); + iter->second(registry_[name].get()); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus OpDefRegistryBase::Find(const std::string &name, + const OpRegistrationInfo **info) { + auto iter = registry_.find(name); + if (iter == registry_.end()) { + *info = nullptr; + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "Mace do not support the operation: " + name); + } + *info = iter->second.get(); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace mace diff --git a/mace/core/op_def_registry.h b/mace/core/op_def_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..8e0156587b9381cc1494e5545b9cc40978ad4e21 --- /dev/null +++ 
b/mace/core/op_def_registry.h @@ -0,0 +1,81 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_OP_DEF_REGISTRY_H_ +#define MACE_CORE_OP_DEF_REGISTRY_H_ + +#include +#include +#include +#include +#include + +#include "mace/proto/mace.pb.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +// Device placement function +typedef std::function()> DevicePlaceFunc; + +struct OpRegistrationInfo { + OpRegistrationInfo() = default; + explicit OpRegistrationInfo(const DevicePlaceFunc &func) + : device_place_func_(func) {} + + DevicePlaceFunc device_place_func_; +}; + +class OpRegistrationBuilder { + public: + explicit OpRegistrationBuilder(const std::string name); + + const std::string name() const; + + OpRegistrationBuilder &SetDevicePlaceFunc( + std::vector (*func)()); + + void Finalize(OpRegistrationInfo *info) const; + private: + std::string name_; + OpRegistrationInfo info_; +}; + +class OpDefRegistryBase { + public: + typedef std::function OpRegistrar; + OpDefRegistryBase() = default; + virtual ~OpDefRegistryBase() = default; + void AddRegistrar(const std::string name, const OpRegistrar ®istrar); + MaceStatus Register(const std::string &name); + MaceStatus Find(const std::string &name, const OpRegistrationInfo **info); + + private: + std::unordered_map registrar_; + std::unordered_map< + std::string, + std::unique_ptr> registry_; + 
MACE_DISABLE_COPY_AND_ASSIGN(OpDefRegistryBase); +}; + +void AddOpRegistrar(OpDefRegistryBase *registry, + const OpRegistrationBuilder &builder); + +#define MACE_REGISTER_OP_DEF(op_def_registry, builder) \ + AddOpRegistrar(op_def_registry, builder) + +} // namespace mace + +#endif // MACE_CORE_OP_DEF_REGISTRY_H_ diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 5e4048358bfc7717da3a09e93899800750bb157a..d29c84e30ea9876a2b8c67f96723012f8429d641 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -14,18 +14,69 @@ #include #include -#include #include #include "mace/core/operator.h" -#include "mace/core/op_kernel_context.h" namespace mace { -OperatorBase::OperatorBase(const OperatorDef &operator_def, - OpKernelContext *context) - : operator_def_(std::make_shared(operator_def)) { - MACE_UNUSED(context); +OpConstructContext::OpConstructContext(Workspace *ws) + : operator_def_(nullptr), ws_(ws), device_(nullptr) {} +OpConstructContext::OpConstructContext(OperatorDef *operator_def, + Workspace *ws, + Device *device) + : operator_def_(operator_def), ws_(ws), device_(device) {} + +OpInitContext::OpInitContext(Workspace *ws, Device *device) + : ws_(ws), device_(device) {} + +Operation::Operation(OpConstructContext *context) + : operator_def_(std::make_shared(*(context->operator_def()))) +{} + +MaceStatus Operation::Init(OpInitContext *context) { + Workspace *ws = context->workspace(); + for (const std::string &input_str : operator_def_->input()) { + const Tensor *tensor = ws->GetTensor(input_str); + MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(), + ": Encountered a non-existing input tensor: ", input_str); + inputs_.push_back(tensor); + } + // TODO(liuqi): filter transform + for (int i = 0; i < operator_def_->output_size(); ++i) { + const std::string output_str = operator_def_->output(i); + if (ws->HasTensor(output_str)) { + // TODO(liuqi): Workspace should pre-allocate all of the output tensors + 
outputs_.push_back(ws->GetTensor(output_str)); + } else { + MACE_CHECK( + operator_def_->output_type_size() == 0 || + operator_def_->output_size() == operator_def_->output_type_size(), + "operator output size != operator output type size", + operator_def_->output_size(), + operator_def_->output_type_size()); + DataType output_type; + if (i < operator_def_->output_type_size()) { + output_type = operator_def_->output_type(i); + } else { + output_type = static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DT_FLOAT))); + } + outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( + output_str, context->device()->allocator(), output_type))); + + if (i < operator_def_->output_shape_size()) { + std::vector + shape_configured(operator_def_->output_shape(i).dims_size()); + for (size_t dim = 0; dim < shape_configured.size(); ++dim) { + shape_configured[dim] = operator_def_->output_shape(i).dims(dim); + } + ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); + } + } + } + return MaceStatus::MACE_SUCCESS; } OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {} @@ -36,7 +87,7 @@ OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) { } OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name, - const DataType allowed) { + DataType allowed) { type_constraint_[attr_name] = allowed; return *this; } @@ -53,27 +104,28 @@ const std::string OpKeyBuilder::Build() { return ss.str(); } -OperatorRegistryBase::~OperatorRegistryBase() {} +OpRegistryBase::~OpRegistryBase() = default; -std::unique_ptr OperatorRegistryBase::CreateOperator( - const OperatorDef &operator_def, - OpKernelContext *context, - DeviceType type, +std::unique_ptr OpRegistryBase::CreateOperation( + OpConstructContext *context, + DeviceType device_type, const NetMode mode) const { + OperatorDef *operator_def = context->operator_def(); const int dtype = ProtoArgHelper::GetOptionalArg( - operator_def, "T", static_cast(DT_FLOAT)); + *operator_def, "T", 
static_cast(DT_FLOAT)); const int op_mode_i = ProtoArgHelper::GetOptionalArg( - operator_def, "mode", static_cast(NetMode::NORMAL)); + *operator_def, "mode", static_cast(NetMode::NORMAL)); const NetMode op_mode = static_cast(op_mode_i); - VLOG(3) << "Creating operator " << operator_def.name() << "(" - << operator_def.type() << "<" << dtype << ">" << ")"; + VLOG(3) << "Creating operator " << operator_def->name() << "(" + << operator_def->type() << "<" << dtype << ">" << ") on " + << device_type; if (op_mode == mode) { return registry_.Create( - OpKeyBuilder(operator_def.type().data()) - .Device(type) + OpKeyBuilder(operator_def->type().data()) + .Device(device_type) .TypeConstraint("T", static_cast(dtype)) .Build(), - operator_def, context); + context); } else { return nullptr; } diff --git a/mace/core/operator.h b/mace/core/operator.h index e0b84535a91199a97b1989d5e0b62d323eb29192..34de7e72be02e43919c8d81d83b1d9f60af0f0a3 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -21,8 +21,7 @@ #include #include "mace/core/arg_helper.h" -#include "mace/core/future.h" -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" #include "mace/core/registry.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" @@ -30,10 +29,66 @@ namespace mace { -class OperatorBase { +// memory_optimizer, device +class OpConstructContext { public: - explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *); - virtual ~OperatorBase() noexcept {} + explicit OpConstructContext(Workspace *ws); + OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device); + ~OpConstructContext() = default; + + inline void set_operator_def(OperatorDef *operator_def) { + operator_def_ = operator_def; + } + + inline OperatorDef *operator_def() const { + return operator_def_; + } + + inline Workspace *workspace() const { + return ws_; + } + + inline void set_device(Device* device) { + device_ = device; + } + + inline Device *device() const { 
+ return device_; + } + + private: + OperatorDef *operator_def_; + Workspace *ws_; + Device *device_; +}; + +// memory_optimizer, device +class OpInitContext { + public: + explicit OpInitContext(Workspace *ws, Device *device = nullptr); + ~OpInitContext() = default; + + inline Workspace *workspace() const { + return ws_; + } + + inline void set_device(Device *device) { + device_ = device; + } + + inline Device *device() const { + return device_; + } + + private: + Workspace *ws_; + Device *device_; +}; + +class Operation { + public: + explicit Operation(OpConstructContext *context); + virtual ~Operation() = default; template inline T GetOptionalArg(const std::string &name, @@ -50,6 +105,10 @@ class OperatorBase { *operator_def_, name, default_value); } + inline DeviceType device_type() const { + return static_cast(operator_def_->device_type()); + } + inline const Tensor *Input(unsigned int idx) { MACE_CHECK(idx < inputs_.size()); return inputs_[idx]; @@ -63,7 +122,8 @@ class OperatorBase { inline const std::vector &Outputs() { return outputs_; } // Run Op asynchronously (depends on device), return a future if not nullptr. 
- virtual MaceStatus Run(StatsFuture *future) = 0; + virtual MaceStatus Init(OpInitContext *); + virtual MaceStatus Run(OpContext *) = 0; inline const OperatorDef &debug_def() const { MACE_CHECK(has_debug_def(), "operator_def was null!"); @@ -82,55 +142,7 @@ class OperatorBase { std::vector inputs_; std::vector outputs_; - MACE_DISABLE_COPY_AND_ASSIGN(OperatorBase); -}; - -template -class Operator : public OperatorBase { - public: - explicit Operator(const OperatorDef &operator_def, OpKernelContext *context) - : OperatorBase(operator_def, context) { - Workspace *ws = context->workspace(); - for (const std::string &input_str : operator_def.input()) { - const Tensor *tensor = ws->GetTensor(input_str); - MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), - ": Encountered a non-existing input tensor: ", input_str); - inputs_.push_back(tensor); - } - - for (int i = 0; i < operator_def.output_size(); ++i) { - const std::string output_str = operator_def.output(i); - if (ws->HasTensor(output_str)) { - outputs_.push_back(ws->GetTensor(output_str)); - } else { - MACE_CHECK( - operator_def.output_type_size() == 0 - || operator_def.output_size() == operator_def.output_type_size(), - "operator output size != operator output type size", - operator_def.output_size(), - operator_def.output_type_size()); - DataType output_type; - if (i < operator_def.output_type_size()) { - output_type = operator_def.output_type(i); - } else { - output_type = DataTypeToEnum::v(); - } - outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( - output_str, context->device()->allocator(), output_type))); - - if (i < operator_def.output_shape_size()) { - std::vector - shape_configured(operator_def.output_shape(i).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = operator_def.output_shape(i).dims(dim); - } - ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); - } - } - } - } - MaceStatus Run(StatsFuture *future) override = 0; - 
~Operator() noexcept override {} + MACE_DISABLE_COPY_AND_ASSIGN(Operation); }; // MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the @@ -154,7 +166,8 @@ class OpKeyBuilder { OpKeyBuilder &Device(DeviceType device); - OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed); + OpKeyBuilder &TypeConstraint(const char *attr_name, + DataType allowed); template OpKeyBuilder &TypeConstraint(const char *attr_name); @@ -172,33 +185,37 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { return this->TypeConstraint(attr_name, DataTypeToEnum::value); } -class OperatorRegistryBase { +class OpRegistryBase { public: typedef Registry + Operation, + OpConstructContext *> RegistryType; - OperatorRegistryBase() = default; - virtual ~OperatorRegistryBase(); + OpRegistryBase() = default; + virtual ~OpRegistryBase(); RegistryType *registry() { return ®istry_; } - std::unique_ptr CreateOperator(const OperatorDef &operator_def, - OpKernelContext *context, - DeviceType type, - const NetMode mode) const; + std::unique_ptr CreateOperation( + OpConstructContext *context, + DeviceType device_type, + const NetMode mode) const; private: RegistryType registry_; - MACE_DISABLE_COPY_AND_ASSIGN(OperatorRegistryBase); + MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase); }; MACE_DECLARE_REGISTRY(OpRegistry, - OperatorBase, - const OperatorDef &, - OpKernelContext *); - -#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \ - MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__) + Operation, + OpConstructContext *); + +#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \ + MACE_REGISTER_CLASS(OpRegistry, \ + op_registry->registry(), \ + OpKeyBuilder(op_type) \ + .Device(device) \ + .TypeConstraint
("T") \ + .Build(), \ + class_name) } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index ac8a358240d88bbd3f81f3d94456f5bb9daa7c58..ae168a547090c5e133f32b714d6d80f77b374987 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,12 +18,13 @@ #include #endif -#include #include #include #include -#include #include +#include +#include +#include #include #include @@ -85,9 +86,10 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) { int err = sched_setaffinity(pid, sizeof(mask), &mask); if (err) { LOG(WARNING) << "set affinity error: " << strerror(errno); - return MACE_INVALID_ARGS; + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "set affinity error: " + std::string(strerror(errno))); } else { - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } @@ -104,7 +106,9 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, if (cpu_max_freq[i] == 0) { LOG(WARNING) << "Cannot get CPU" << i << "'s max frequency info, maybe it is offline."; - return MACE_INVALID_ARGS; + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "Cannot get CPU's max frequency info," + " maybe it is offline."); } } @@ -124,7 +128,7 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, @@ -147,7 +151,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, CPU_SET(cpu_id, &mask); } #ifdef MACE_ENABLE_OPENMP - std::vector status(omp_num_threads); + std::vector status(omp_num_threads, + MaceStatus::MACE_INVALID_ARGS); #pragma omp parallel for for (int i = 0; i < omp_num_threads; ++i) { VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num() @@ -155,10 +160,10 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, status[i] = SetThreadAffinity(mask); } for (int i = 0; i < omp_num_threads; ++i) { - if (status[i] != 
MACE_SUCCESS) - return MACE_INVALID_ARGS; + if (status[i] != MaceStatus::MACE_SUCCESS) + return MaceStatus::MACE_INVALID_ARGS; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; #else MaceStatus status = SetThreadAffinity(mask); VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0]; @@ -183,13 +188,13 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( #else LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } std::vector big_core_ids; std::vector little_core_ids; MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids); - if (res != MACE_SUCCESS) { + if (res != MaceStatus::MACE_SUCCESS) { return res; } diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 4b0f796b29b19985b1e63c41f070c8ee86ea7f81..a6926e9e1e1198a704fa68d119111ea70c701079 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -55,6 +55,14 @@ class CPURuntime { return num_threads_; } + CPUAffinityPolicy policy() const { + return policy_; + } + + bool use_gemmlowp() const { + return gemm_context_ != nullptr; + } + private: MaceStatus SetOpenMPThreadsAndAffinityPolicy( int omp_num_threads_hint, diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc index 112a94bff7dd3ee0c24e56aae1431b24b258ce50..09bb91816d0ff6aff45b68c85473d4a89b0ddc79 100644 --- a/mace/core/runtime/opencl/gpu_device.cc +++ b/mace/core/runtime/opencl/gpu_device.cc @@ -38,7 +38,7 @@ OpenCLRuntime* GPUDevice::opencl_runtime() { return runtime_.get(); } -Allocator* GPUDevice::allocator() { +Allocator *GPUDevice::allocator() { return allocator_.get(); } diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc index ae5bbc78f58df7ebbcf24abe49118c03960afa18..b414782a8462cee1bf596a9cae00718b30ff9844 100755 --- 
a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc +++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -27,6 +27,7 @@ #include "src/main/cpp/include/mace/public/mace.h" #include "src/main/cpp/include/mace/public/mace_engine_factory.h" +#include "mace/public/mace.h" namespace { @@ -112,11 +113,12 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( omp_num_threads, static_cast(cpu_affinity_policy), true); - if (status != mace::MACE_SUCCESS) { + if (status != mace::MaceStatus::MACE_SUCCESS) { __android_log_print(ANDROID_LOG_ERROR, "image_classify attrs", - "openmp result: %d, threads: %d, cpu: %d", - status, omp_num_threads, cpu_affinity_policy); + "openmp result: %s, threads: %d, cpu: %d", + status.information().c_str(), omp_num_threads, + cpu_affinity_policy); } if (mace_context.device_type == mace::DeviceType::GPU) { config.SetGPUContext(mace_context.gpu_context); @@ -163,8 +165,8 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( __android_log_print(ANDROID_LOG_INFO, "image_classify attrs", - "create result: %d", - create_engine_status); + "create result: %s", + create_engine_status.information().c_str()); return create_engine_status == mace::MaceStatus::MACE_SUCCESS ? JNI_OK : JNI_ERR; diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 204be499ebd0c1500072da106f39800e2fca1384..73adbb75dff9e02a84fd5c1520a2330c25054f39 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -170,7 +170,7 @@ bool RunModel(const std::vector &input_names, status = config.SetCPUThreadPolicy( FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy)); - if (status != MACE_SUCCESS) { + if (status != MaceStatus::MACE_SUCCESS) { std::cerr << "Set openmp or cpu affinity failed." 
<< std::endl; } #ifdef MACE_ENABLE_OPENCL diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index 6b37cf5060376493d0925f8eeca8360e83261f84..885285784ef6ed53fc911e629e1df8f712f1e5fa 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -28,12 +28,20 @@ cc_library( "*_test.cc", "*_benchmark.cc", "arm/*_test.cc", + "buffer_inverse_transform.cc", + "buffer_transform.cc", + "lstm_cell.cc", + "winograd_transform.cc", ], ) + if_opencl_enabled(glob( [ "opencl/*.cc", "opencl/image/*.cc", "opencl/buffer/*.cc", + "buffer_inverse_transform.cc", + "buffer_transform.cc", + "lstm_cell.cc", + "winograd_transform.cc", ], exclude = [ "opencl/*_test.cc", @@ -44,18 +52,10 @@ cc_library( "*.h", "arm/*.h", ], - exclude = [ - "buffer_transform.h", - "buffer_inverse_transform.h", - "lstmcell.h", - ], ) + if_opencl_enabled(glob([ "opencl/*.h", "opencl/image/*.h", "opencl/buffer/*.h", - "buffer_transform.h", - "buffer_inverse_transform.h", - "lstmcell.h", ])), copts = [ "-Werror", @@ -77,7 +77,6 @@ cc_library( linkopts = if_android(["-lm"]), deps = [ "//mace/core", - "//mace/utils", "@gemmlowp", "@tflite", ], diff --git a/mace/kernels/activation.cc b/mace/kernels/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..038c45494393a289f2487890458c540e053ff2da --- /dev/null +++ b/mace/kernels/activation.cc @@ -0,0 +1,118 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/kernels/activation.h" + +#include + +#include "mace/core/operator.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/activation.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +class ActivationOp; + +template <> +class ActivationOp : public Operation { + public: + explicit ActivationOp(OpConstructContext *context) + : Operation(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", + 0.0f)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + const float *input_ptr = input->data(); + float *output_ptr = output->mutable_data(); + if (activation_ == PRELU) { + MACE_CHECK(this->InputSize() > 1); + const Tensor *alpha = this->Input(1); + const float *alpha_ptr = alpha->data(); + const index_t outer_size = output->dim(0); + const index_t inner_size = output->dim(2) * output->dim(3); + PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, + alpha_ptr, output_ptr); + } else { + DoActivation(input_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + } + return MaceStatus::MACE_SUCCESS; + } + + private: + ActivationType activation_; + float relux_max_limit_; +}; + + +#ifdef MACE_ENABLE_OPENCL +template +class ActivationOp : public Operation { + public: + explicit ActivationOp(OpConstructContext *context) + : Operation(context) { + ActivationType type = kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP")); + auto relux_max_limit = static_cast( + Operation::GetOptionalArg("max_limit", 0.0f)); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset( + new opencl::image::ActivationKernel(type, relux_max_limit)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + 
MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *alpha = this->InputSize() > 1 ? this->Input(1) : nullptr; + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, alpha, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterActivation(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 66ec407fd85e9c3cefbbba1b4b235ba0c9859712..127284656039e658c5d3aad9aecd21af08cb89c5 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -17,15 +17,11 @@ #include #include -#include #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/kernels/kernel.h" #include "mace/kernels/arm/activation_neon.h" +#include "mace/utils/logging.h" namespace mace { namespace kernels { @@ -153,73 +149,6 @@ void PReLUActivation(const T *input_ptr, } } -template -class ActivationFunctor; - -template <> -class ActivationFunctor : OpKernel { - public: - ActivationFunctor(OpKernelContext *context, - ActivationType type, - float relux_max_limit) - : OpKernel(context), - activation_(type), - relux_max_limit_(relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); - if (activation_ == PRELU) { - MACE_CHECK_NOTNULL(alpha); - const float 
*alpha_ptr = alpha->data(); - const index_t outer_size = output->dim(0); - const index_t inner_size = output->dim(2) * output->dim(3); - PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, - alpha_ptr, output_ptr); - } else { - DoActivation(input_ptr, output_ptr, output->size(), activation_, - relux_max_limit_); - } - return MACE_SUCCESS; - } - - private: - ActivationType activation_; - float relux_max_limit_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLActivationKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel); -}; -template -class ActivationFunctor : OpKernel { - public: - ActivationFunctor(OpKernelContext *context, - ActivationType type, - T relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future); - - private: - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - } // namespace kernels } // namespace mace diff --git a/mace/kernels/addn.cc b/mace/kernels/addn.cc new file mode 100644 index 0000000000000000000000000000000000000000..6634e8e8ffd1993dc9eb277d7fe7a4b180841928 --- /dev/null +++ b/mace/kernels/addn.cc @@ -0,0 +1,146 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#include +#endif + +#include +#include + +#include "mace/core/operator.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/addn.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +static constexpr int kCostPerGroup = 1024; + +template +class AddNOp; + +template <> +class AddNOp : public Operation { + public: + explicit AddNOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Tensor *output_tensor = this->Output(0); + size_t input_size = this->inputs_.size(); + MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(inputs_[0])); + index_t size = output_tensor->size(); + Tensor::MappingGuard output_map(output_tensor); + float *output_data = output_tensor->mutable_data(); + memset(output_data, 0, size * sizeof(float)); + int64_t cost = size * input_size; + int64_t groups = 1; + if (cost > kCostPerGroup) { + groups = cost / kCostPerGroup; + } + int64_t element_per_group = size / groups; + + std::vector mappers; + for (size_t i = 0; i < input_size; ++i) { + MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size()); + MACE_CHECK(inputs_[0]->size() == inputs_[i]->size()) + << "Input 0: " << MakeString(inputs_[0]->shape()) + << ", size: " << inputs_[0]->size() << ". 
Input " << i << ": " + << MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size(); + mappers.emplace_back(Tensor::MappingGuard(inputs_[i])); + } + +#pragma omp parallel for + for (int64_t i = 0; i < size; i += element_per_group) { + int64_t count = std::min(element_per_group, size - i); + int nn = count >> 2; + int remain = count - (nn << 2); + for (size_t j = 0; j < input_size; ++j) { + const float *input_data = inputs_[j]->data(); + const float *input_ptr = input_data + i; + float *output_ptr = output_data + i; + for (int k = 0; k < nn; ++k) { +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + float32x4_t in = vld1q_f32(input_ptr); + float32x4_t out = vld1q_f32(output_ptr); + out = vaddq_f32(out, in); + vst1q_f32(output_ptr, out); +#else + for (int m = 0; m < 4; ++m) { + output_ptr[m] += input_ptr[m]; + } +#endif + + input_ptr += 4; + output_ptr += 4; + } + for (int k = 0; k < remain; ++k) { + *output_ptr += *input_ptr; + ++input_ptr; + ++output_ptr; + } + } + } + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class AddNOp : public Operation { + public: + explicit AddNOp(OpConstructContext *context) + : Operation(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::AddNKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + Tensor *output_tensor = this->Output(0); + size_t n = this->inputs_.size(); + for (size_t i = 1; i < n; ++i) { + MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size()); + MACE_CHECK(inputs_[0]->size() == inputs_[i]->size()) + << "Input 0: " << MakeString(inputs_[0]->shape()) + << ", size: " << inputs_[0]->size() << ". 
Input " << i << ": " + << MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size(); + } + + return kernel_->Compute(context, inputs_, output_tensor); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterAddN(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h deleted file mode 100644 index 2fa3e21a91c48782cbcb73de4326731f6b656671..0000000000000000000000000000000000000000 --- a/mace/kernels/addn.h +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_ADDN_H_ -#define MACE_KERNELS_ADDN_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -namespace mace { -namespace kernels { - -constexpr int kCostPerGroup = 1024; - -template -struct AddNFunctor : OpKernel { - explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { - MACE_UNUSED(future); - MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); - index_t size = output_tensor->size(); - Tensor::MappingGuard output_map(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, size * sizeof(float)); - int n = input_tensors.size(); - int64_t cost = size * n; - int64_t groups = 1; - if (cost > kCostPerGroup) { - groups = cost / kCostPerGroup; - } - int64_t element_per_group = size / groups; - - std::vector mappers; - for (int64_t i = 0; i < n; ++i) { - mappers.emplace_back(Tensor::MappingGuard(input_tensors[i])); - } - -#pragma omp parallel for - for (int64_t i = 0; i < size; i += element_per_group) { - int64_t count = std::min(element_per_group, size - i); - int nn = count >> 2; - int remain = count - (nn << 2); - for (int64_t j = 0; j < n; ++j) { - const float *input_data = input_tensors[j]->data(); - const float *input_ptr = input_data + i; - float *output_ptr = output_data + i; - for (int k = 0; k < nn; ++k) { -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) - float32x4_t in = vld1q_f32(input_ptr); - float32x4_t out = vld1q_f32(output_ptr); - out = vaddq_f32(out, in); - vst1q_f32(output_ptr, out); -#else - for (int m = 0; m < 4; ++m) { - output_ptr[m] += input_ptr[m]; - } -#endif - - input_ptr += 4; - output_ptr += 4; - } - for (int k = 0; k < remain; ++k) { - *output_ptr += *input_ptr; - ++input_ptr; - 
++output_ptr; - } - } - } - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLAddNKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel); -}; -template -struct AddNFunctor : OpKernel { - explicit AddNFunctor(OpKernelContext *context); - MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_ADDN_H_ diff --git a/mace/kernels/argmax.h b/mace/kernels/argmax.cc similarity index 77% rename from mace/kernels/argmax.h rename to mace/kernels/argmax.cc index 36218d627fce5f220cd89120728e73887155fb16..19d52f7fb1980b94c8fb7b22b5a9855e70dbcd73 100644 --- a/mace/kernels/argmax.h +++ b/mace/kernels/argmax.cc @@ -12,32 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_ARGMAX_H_ -#define MACE_KERNELS_ARGMAX_H_ - #include #include #include #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { -template -struct ArgMaxFunctor : OpKernel { - explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - const Tensor *axis, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class ArgMaxOp : public Operation { + public: + explicit ArgMaxOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + const Tensor *axis = this->Input(1); + Tensor *output = this->Output(0); MACE_CHECK(input->dim_size() > 0, "ArgMax input should not be a scalar"); MACE_CHECK(axis->dim_size() == 0, "Mace argmax only supports scalar axis"); @@ -77,11 +73,16 @@ struct ArgMaxFunctor : OpKernel { output_data[i] = idx; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; + + +void RegisterArgMax(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_ARGMAX_H_ diff --git a/mace/kernels/batch_norm.cc b/mace/kernels/batch_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..b07f2f43d0100ced6b744822db8869a17f043eca --- /dev/null +++ b/mace/kernels/batch_norm.cc @@ -0,0 +1,209 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/batch_norm.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +class BatchNormOp; + +template <> +class BatchNormOp : public Operation { + public: + explicit BatchNormOp(OpConstructContext *context) + : Operation(context), + epsilon_(Operation::GetOptionalArg("epsilon", + static_cast(1e-4))), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + bool not_folded = this->InputSize() == 5; + const Tensor *input = this->Input(INPUT); + const Tensor *scale = this->Input(SCALE); + const Tensor *offset = this->Input(OFFSET); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", + input->dim_size()); + MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", + scale->dim_size()); + MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", + offset->dim_size()); + + Tensor *output = this->Output(OUTPUT); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . 
+ // The calculation formula for inference is + // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + + // ( \offset - \frac { \scale * mean } { + // \sqrt{var+\variance_epsilon} } + // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } + // new_offset = \offset - mean * common_val; + // Y = new_scale * X + new_offset; + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = input->dim(2); + const index_t width = input->dim(3); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard scale_mapper(scale); + Tensor::MappingGuard offset_mapper(offset); + Tensor::MappingGuard output_mapper(output); + + const float *input_ptr = input->data(); + const float *scale_ptr = scale->data(); + const float *offset_ptr = offset->data(); + float *output_ptr = output->mutable_data(); + + std::vector new_scale; + std::vector new_offset; + if (not_folded) { + const Tensor *mean = this->Input(MEAN); + const Tensor *var = this->Input(VAR); + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", + mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", + var->dim_size()); + new_scale.resize(channels); + new_offset.resize(channels); + Tensor::MappingGuard mean_mapper(mean); + Tensor::MappingGuard var_mapper(var); + const float *mean_ptr = mean->data(); + const float *var_ptr = var->data(); +#pragma omp parallel for + for (index_t c = 0; c < channels; ++c) { + new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); + new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + } + } + + const float *scale_data = not_folded ? new_scale.data() : scale_ptr; + const float + *offset_data = not_folded ? 
new_offset.data() : offset_ptr; + + index_t channel_size = height * width; + index_t batch_size = channels * channel_size; + + // NEON is slower, so stick to the trivial implementaion +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + index_t offset = b * batch_size + c * channel_size; + for (index_t hw = 0; hw < height * width; ++hw) { + output_ptr[offset + hw] = + scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + } + } + } + DoActivation(output_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } + + private: + float epsilon_; + const ActivationType activation_; + const float relux_max_limit_; + + protected: + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + + +#ifdef MACE_ENABLE_OPENCL +template +class BatchNormOp : public Operation { + public: + explicit BatchNormOp(OpConstructContext *context) + : Operation(context) { + float epsilon = Operation::GetOptionalArg( + "epsilon", static_cast(1e-4)); + ActivationType activation = kernels::StringToActivationType( + Operation::GetOptionalArg("activation", "NOOP")); + float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BatchNormKernel( + epsilon, activation, relux_max_limit)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + bool not_folded = this->InputSize() == 5; + const Tensor *input = this->Input(INPUT); + const Tensor *scale = this->Input(SCALE); + const Tensor *offset = this->Input(OFFSET); + const Tensor *mean = not_folded ? this->Input(MEAN) : nullptr; + const Tensor *var = not_folded ? this->Input(VAR) : nullptr; + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", + input->dim_size()); + MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. 
", + scale->dim_size()); + MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", + offset->dim_size()); + if (not_folded) { + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", + mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", + var->dim_size()); + } + + Tensor *output = this->Output(OUTPUT); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, scale, offset, mean, + var, output); + } + + private: + std::unique_ptr kernel_; + + protected: + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterBatchNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h deleted file mode 100644 index 75e58f937ddca72241648ae52d4df8f079bf3f39..0000000000000000000000000000000000000000 --- a/mace/kernels/batch_norm.h +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_BATCH_NORM_H_ -#define MACE_KERNELS_BATCH_NORM_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/public/mace.h" - -namespace mace { -namespace kernels { - -template -struct BatchNormFunctor; - -template<> -struct BatchNormFunctor : OpKernel { - BatchNormFunctor(OpKernelContext *context, - const bool folded_constant, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - folded_constant_(folded_constant), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . - // The calculation formula for inference is - // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + - // ( \offset - \frac { \scale * mean } { - // \sqrt{var+\variance_epsilon} } - // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } - // new_offset = \offset - mean * common_val; - // Y = new_scale * X + new_offset; - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t height = input->dim(2); - const index_t width = input->dim(3); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard scale_mapper(scale); - Tensor::MappingGuard offset_mapper(offset); - Tensor::MappingGuard output_mapper(output); - - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *offset_ptr = offset->data(); - float *output_ptr = output->mutable_data(); - - std::vector new_scale; - std::vector new_offset; - if (!folded_constant_) { - new_scale.resize(channels); - new_offset.resize(channels); - 
Tensor::MappingGuard mean_mapper(mean); - Tensor::MappingGuard var_mapper(var); - const float *mean_ptr = mean->data(); - const float *var_ptr = var->data(); -#pragma omp parallel for - for (index_t c = 0; c < channels; ++c) { - new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon); - new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; - } - } - - const float *scale_data = folded_constant_ ? scale_ptr : new_scale.data(); - const float - *offset_data = folded_constant_ ? offset_ptr : new_offset.data(); - - index_t channel_size = height * width; - index_t batch_size = channels * channel_size; - - // NEON is slower, so stick to the trivial implementaion -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - index_t offset = b * batch_size + c * channel_size; - for (index_t hw = 0; hw < height * width; ++hw) { - output_ptr[offset + hw] = - scale_data[c] * input_ptr[offset + hw] + offset_data[c]; - } - } - } - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_); - - return MACE_SUCCESS; - } - - const bool folded_constant_; - const ActivationType activation_; - const float relux_max_limit_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLBatchNormKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel); -}; -template -struct BatchNormFunctor : OpKernel { - BatchNormFunctor(OpKernelContext *context, - const bool folded_constant, - const ActivationType activation, - const float relux_max_limit); - MaceStatus operator()(const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future); - 
std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BATCH_NORM_H_ diff --git a/mace/kernels/batch_to_space.h b/mace/kernels/batch_to_space.cc similarity index 77% rename from mace/kernels/batch_to_space.h rename to mace/kernels/batch_to_space.cc index 45b2ff8850f8cedef493fac1da575ec594d482be..5df98aefb0172f81ca8a1b48eb899c2a929718bb 100644 --- a/mace/kernels/batch_to_space.h +++ b/mace/kernels/batch_to_space.cc @@ -12,34 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_ -#define MACE_KERNELS_BATCH_TO_SPACE_H_ - -#include -#include #include +#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/batch_to_space.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct BatchToSpaceFunctorBase : OpKernel { - BatchToSpaceFunctorBase(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : OpKernel(context), - paddings_(paddings.begin(), paddings.end()), - block_shape_(block_shape.begin(), block_shape.end()) { +class BatchToSpaceOpBase : public Operation { + public: + explicit BatchToSpaceOpBase(OpConstructContext *context) + : Operation(context), + paddings_(Operation::GetRepeatedArgs("crops", {0, 0, 0, 0})), + block_shape_(Operation::GetRepeatedArgs("block_shape", {1, 1})) { MACE_CHECK( - block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1, + block_shape_.size() == 2 && block_shape_[0] > 1 && block_shape_[1] > 1, "Block's shape should be 1D, and greater than 1"); - MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D"); + MACE_CHECK(paddings_.size() == 4, "Paddings' shape should be 2D"); } + protected: std::vector paddings_; 
std::vector block_shape_; @@ -83,21 +79,19 @@ struct BatchToSpaceFunctorBase : OpKernel { } }; -template -struct BatchToSpaceFunctor; - -template<> -struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { - BatchToSpaceFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) {} +template +class BatchToSpaceNDOp; - MaceStatus operator()(const Tensor *batch_tensor, - Tensor *space_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class BatchToSpaceNDOp : public BatchToSpaceOpBase { + public: + explicit BatchToSpaceNDOp(OpConstructContext *context) + : BatchToSpaceOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *batch_tensor = this->Input(0); + Tensor *space_tensor = this->Output(0); std::vector output_shape(4, 0); CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NCHW, @@ -177,24 +171,21 @@ struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { } // block_h } // c - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; -template<> -struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { - BatchToSpaceFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) {} - - MaceStatus operator()(const Tensor *batch_tensor, - Tensor *space_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class BatchToSpaceNDOp : public BatchToSpaceOpBase { + public: + explicit BatchToSpaceNDOp(OpConstructContext *context) + : BatchToSpaceOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *batch_tensor = this->Input(0); + Tensor *space_tensor = this->Output(0); std::vector output_shape(4, 0); - CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, output_shape.data()); @@ -264,38 +255,53 @@ struct BatchToSpaceFunctor : 
BatchToSpaceFunctorBase { } // h } // b - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; #ifdef MACE_ENABLE_OPENCL -class OpenCLBatchToSpaceKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *batch_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *space_tensor, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel); -}; template -struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { - BatchToSpaceFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape); - - MaceStatus operator()(const Tensor *batch_tensor, - Tensor *space_tensor, - StatsFuture *future); +class BatchToSpaceNDOp : public BatchToSpaceOpBase { + public: + explicit BatchToSpaceNDOp(OpConstructContext *context) + : BatchToSpaceOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BatchToSpaceKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *batch_tensor = this->Input(0); + Tensor *space_tensor = this->Output(0); + std::vector output_shape(4, 0); + CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, + output_shape.data()); + return kernel_->Compute(context, batch_tensor, paddings_, block_shape_, + output_shape, space_tensor); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::GPU, half); +#endif // 
MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.cc similarity index 50% rename from mace/kernels/bias_add.h rename to mace/kernels/bias_add.cc index d58a4d93e26a6f10c324bb21f0e3dca9c111ec3a..fc8b7374c3c1228a99844c0a4a5d40e747cb1d13 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.cc @@ -12,43 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_BIAS_ADD_H_ -#define MACE_KERNELS_BIAS_ADD_H_ - #include #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/bias_add.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct BiasAddFunctorBase : OpKernel { - BiasAddFunctorBase(OpKernelContext *context, - const DataFormat data_format) - : OpKernel(context), data_format_(data_format) {} +template +class BiasAddOp; - DataFormat data_format_; -}; +template <> +class BiasAddOp : public Operation { + public: + explicit BiasAddOp(OpConstructContext *context) + : Operation(context), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", NHWC))) {} -template -struct BiasAddFunctor; + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + const Tensor *bias = this->Input(1); -template <> -struct BiasAddFunctor : BiasAddFunctorBase { - BiasAddFunctor(OpKernelContext *context, - const DataFormat data_format) - : BiasAddFunctorBase(context, data_format) {} + MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. 
", + bias->dim_size()); - MaceStatus operator()(const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard bias_mapper(bias); @@ -87,35 +84,60 @@ struct BiasAddFunctor : BiasAddFunctorBase { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -}; -#ifdef MACE_ENABLE_OPENCL -class OpenCLBiasAddKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel); + private: + DataFormat data_format_; }; +#ifdef MACE_ENABLE_OPENCL template -struct BiasAddFunctor : BiasAddFunctorBase { - BiasAddFunctor(OpKernelContext *context, const DataFormat data_format); - MaceStatus operator()(const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +class BiasAddOp : public Operation { + public: + explicit BiasAddOp(OpConstructContext *context) + : Operation(context), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", NHWC))) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BiasAddKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *bias = this->Input(1); + + MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. 
", + bias->dim_size()); + + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, + "gpu only support biasadd for 4-dimensional NHWC format tensor"); + return kernel_->Compute(context, input, bias, output); + } + private: + DataFormat data_format_; std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterBiasAdd(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_BIAS_ADD_H_ diff --git a/mace/kernels/buffer_inverse_transform.cc b/mace/kernels/buffer_inverse_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..b447334cb3bc0daf7081b562281ce2acbdfb3ee0 --- /dev/null +++ b/mace/kernels/buffer_inverse_transform.cc @@ -0,0 +1,67 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "mace/core/operator.h" +#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h" +#include "mace/kernels/opencl/image/image_to_buffer.h" + +namespace mace { +namespace kernels { + +template +class BufferInverseTransformOp; + +template +class BufferInverseTransformOp : public Operation { + public: + explicit BufferInverseTransformOp(OpConstructContext *context) + : Operation(context), + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ImageToBuffer); + } else { + kernel_.reset(new opencl::buffer::BufferInverseTransform); + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + kernels::BufferType type = + static_cast(Operation::GetOptionalArg( + "buffer_type", static_cast(kernels::CONV2D_FILTER))); + + return kernel_->Compute(context, input, type, + wino_blk_size_, output); + } + + private: + const int wino_blk_size_; + std::unique_ptr kernel_; +}; + + +void RegisterBufferInverseTransform(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BufferInverseTransform", + BufferInverseTransformOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BufferInverseTransform", + BufferInverseTransformOp, DeviceType::GPU, half); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/buffer_inverse_transform.h b/mace/kernels/buffer_inverse_transform.h deleted file mode 100644 index 2b3e0098cc243c0b14e66e91e8b3e128d6e863a4..0000000000000000000000000000000000000000 --- a/mace/kernels/buffer_inverse_transform.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/opencl/common.h" - -namespace mace { -namespace kernels { - -struct BufferInverseTransformFunctorBase : OpKernel { - BufferInverseTransformFunctorBase(OpKernelContext *context, - const int wino_blk_size) - : OpKernel(context), - wino_blk_size_(wino_blk_size) {} - const int wino_blk_size_; -}; - -template -struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase { - explicit BufferInverseTransformFunctor(OpKernelContext *context, - const int wino_blk_size) - : BufferInverseTransformFunctorBase(context, wino_blk_size) {} - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(input); - MACE_UNUSED(type); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; - -class OpenCLBufferInverseTransformKernel { - public: - virtual MaceStatus Compute(OpKernelContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel) -}; - -template -struct BufferInverseTransformFunctor - : BufferInverseTransformFunctorBase { - explicit BufferInverseTransformFunctor(OpKernelContext *context, - const int wino_blk_size); - MaceStatus operator()(const Tensor 
*input, - const BufferType type, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/kernels/buffer_transform.cc b/mace/kernels/buffer_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b14698cdd36ac20be33c7a6e1c1ddcc3a27dc75 --- /dev/null +++ b/mace/kernels/buffer_transform.cc @@ -0,0 +1,67 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "mace/core/operator.h" +#include "mace/kernels/opencl/buffer/buffer_transform.h" +#include "mace/kernels/opencl/image/buffer_to_image.h" + +namespace mace { +namespace kernels { + +template +class BufferTransformOp; + +template +class BufferTransformOp : public Operation { + public: + explicit BufferTransformOp(OpConstructContext *context) + : Operation(context), + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BufferToImage); + } else { + kernel_.reset(new opencl::buffer::BufferTransform); + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + kernels::BufferType type = + static_cast(Operation::GetOptionalArg( + "buffer_type", static_cast(kernels::CONV2D_FILTER))); + + return kernel_->Compute(context, input, type, + wino_blk_size_, output); + } + + private: + const int wino_blk_size_; + std::unique_ptr kernel_; +}; + + +void RegisterBufferTransform(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BufferTransform", + BufferTransformOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BufferTransform", + BufferTransformOp, DeviceType::GPU, half); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/buffer_transform.h b/mace/kernels/buffer_transform.h deleted file mode 100644 index 8f0fd039c0b68be20373e58d7e465df3d870180b..0000000000000000000000000000000000000000 --- a/mace/kernels/buffer_transform.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_ -#define MACE_KERNELS_BUFFER_TRANSFORM_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/opencl/common.h" - -namespace mace { -namespace kernels { - -struct BufferTransformFunctorBase : OpKernel { - explicit BufferTransformFunctorBase(OpKernelContext *context, - const int wino_blk_size) - : OpKernel(context), wino_blk_size_(wino_blk_size) {} - const int wino_blk_size_; -}; - -template -struct BufferTransformFunctor : BufferTransformFunctorBase { - BufferTransformFunctor(OpKernelContext *context, - const int wino_blk_size) - : BufferTransformFunctorBase(context, wino_blk_size) {} - - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(input); - MACE_UNUSED(type); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; - -class OpenCLBufferTransformKernel { - public: - virtual MaceStatus Compute(OpKernelContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel) -}; - -template -struct BufferTransformFunctor : BufferTransformFunctorBase { - BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size); - - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; 
- -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BUFFER_TRANSFORM_H_ diff --git a/mace/ops/cast.h b/mace/kernels/cast.cc similarity index 74% rename from mace/ops/cast.h rename to mace/kernels/cast.cc index 56d20d52cb97952476b46c993bba6024f59109c2..0bd971e1f051c7aadd716f213e2d8d09b2d0e9f9 100644 --- a/mace/ops/cast.h +++ b/mace/kernels/cast.cc @@ -12,24 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_CAST_H_ -#define MACE_OPS_CAST_H_ - -#include - #include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { template -class CastOp : public Operator { +class CastOp : public Operation { public: - CastOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} + explicit CastOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { - MACE_UNUSED(future); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)) @@ -47,7 +42,7 @@ class CastOp : public Operator { MACE_RUN_WITH_TYPE_ENUM(dst_dtype, MACE_CAST_COPY); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } private: @@ -55,7 +50,12 @@ class CastOp : public Operator { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterCast(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Cast", CastOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Cast", CastOp, + DeviceType::CPU, int32_t); +} -#endif // MACE_OPS_CAST_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.cc similarity index 50% rename from mace/kernels/channel_shuffle.h rename to mace/kernels/channel_shuffle.cc index 
d5cf5fe0bb746918115e271f4b471cab3c1ec8b1..8258ea1cb0eccc1893baf7d54b3cd795fbb5c40f 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.cc @@ -12,28 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_ -#define MACE_KERNELS_CHANNEL_SHUFFLE_H_ - #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/channel_shuffle.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct ChannelShuffleFunctor : OpKernel { - ChannelShuffleFunctor(OpKernelContext *context, const int groups) - : OpKernel(context), groups_(groups) {} +template +class ChannelShuffleOp; - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class ChannelShuffleOp : public Operation { + public: + explicit ChannelShuffleOp(OpConstructContext *context) + : Operation(context), + groups_(Operation::GetOptionalArg("group", 1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim(1) % groups_ == 0, + "input channels must be an integral multiple of group. 
", + input->dim(1)); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard logits_guard(input); @@ -64,35 +69,51 @@ struct ChannelShuffleFunctor : OpKernel { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: const int groups_; }; + #ifdef MACE_ENABLE_OPENCL -class OpenCLChannelShuffleKernel { +template +class ChannelShuffleOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel); -}; -template -struct ChannelShuffleFunctor : OpKernel { - ChannelShuffleFunctor(OpKernelContext *context, const int groups); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + explicit ChannelShuffleOp(OpConstructContext *context) + : Operation(context) { + const int groups = Operation::GetOptionalArg("group", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterChannelShuffle(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_ diff --git a/mace/kernels/concat.h b/mace/kernels/concat.cc similarity index 54% rename from mace/kernels/concat.h rename to 
mace/kernels/concat.cc index 0cb28861648b2221c41e6225afe0743ab36e5d9b..de50119210bc6886972699511ba8021fc994c581 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.cc @@ -12,33 +12,54 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CONCAT_H_ -#define MACE_KERNELS_CONCAT_H_ - #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" #include "mace/utils/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/concat.h" +#endif // MACE_ENABLE_OPENCL + namespace mace { namespace kernels { -template -struct ConcatFunctor : OpKernel { - ConcatFunctor(OpKernelContext *context, const int32_t axis) - : OpKernel(context), axis_(axis) {} +class ConcatOpBase : public Operation { + public: + explicit ConcatOpBase(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 3)) {} + + protected: + void Validate() { + const int32_t input_dims = this->Input(0)->dim_size(); + axis_ = + axis_ < 0 ? 
axis_ + input_dims : axis_; + MACE_CHECK((0 <= axis_ && axis_ < input_dims), + "Expected concatenating axis in the range [", -input_dims, ", ", + input_dims, "], but got ", axis_); + } + + protected: + int axis_; +}; + +template +class ConcatOp; + +template +class ConcatOp : public ConcatOpBase { + public: + explicit ConcatOp(OpConstructContext *context) + : ConcatOpBase(context) {} - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const Tensor *input0 = input_list.front(); - const size_t inputs_count = input_list.size(); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Validate(); + const std::vector &inputs = this->Inputs(); + Tensor *output = this->Output(0); + const Tensor *input0 = inputs.front(); + const size_t inputs_count = inputs.size(); std::vector output_shape(input0->shape()); index_t inner_size = 1; @@ -48,7 +69,7 @@ struct ConcatFunctor : OpKernel { std::vector outer_sizes(inputs_count, 0); outer_sizes[0] = input0->size() / inner_size; for (size_t i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; + const Tensor *input = inputs[i]; MACE_CHECK(input->dim_size() == input0->dim_size(), "Ranks of all input tensors must be same."); for (int j = 0; j < input->dim_size(); ++j) { @@ -65,9 +86,9 @@ struct ConcatFunctor : OpKernel { T *output_ptr = output->mutable_data(); - std::vector input_ptrs(input_list.size(), nullptr); + std::vector input_ptrs(inputs.size(), nullptr); for (size_t i = 0; i < inputs_count; ++i) { - input_ptrs[i] = input_list[i]->data(); + input_ptrs[i] = inputs[i]->data(); } for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) { for (size_t i = 0; i < inputs_count; ++i) { @@ -83,24 +104,24 @@ struct ConcatFunctor : OpKernel { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - - int32_t axis_; }; -template<> -struct ConcatFunctor : OpKernel { - ConcatFunctor(OpKernelContext *context, const int32_t axis) 
- : OpKernel(context), axis_(axis) {} - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class ConcatOp : public ConcatOpBase { + public: + explicit ConcatOp(OpConstructContext *context) + : ConcatOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Validate(); + const std::vector &inputs = this->Inputs(); + Tensor *output = this->Output(0); MACE_CHECK(output->scale() != 0); - const Tensor *input0 = input_list.front(); - const size_t inputs_count = input_list.size(); + const Tensor *input0 = inputs.front(); + const size_t inputs_count = inputs.size(); std::vector output_shape(input0->shape()); index_t inner_size = 1; @@ -110,7 +131,7 @@ struct ConcatFunctor : OpKernel { std::vector outer_sizes(inputs_count, 0); outer_sizes[0] = input0->size() / inner_size; for (size_t i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; + const Tensor *input = inputs[i]; MACE_CHECK(input->dim_size() == input0->dim_size(), "Ranks of all input tensors must be same."); for (int j = 0; j < input->dim_size(); ++j) { @@ -127,22 +148,22 @@ struct ConcatFunctor : OpKernel { auto output_ptr = output->mutable_data(); - std::vector input_ptrs(input_list.size(), nullptr); + std::vector input_ptrs(inputs.size(), nullptr); for (size_t i = 0; i < inputs_count; ++i) { - input_ptrs[i] = input_list[i]->data(); + input_ptrs[i] = inputs[i]->data(); } for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) { for (size_t i = 0; i < inputs_count; ++i) { - if (input_list[i]->zero_point() == output->zero_point() - && input_list[i]->scale() == output->scale()) { + if (inputs[i]->zero_point() == output->zero_point() + && inputs[i]->scale() == output->scale()) { memcpy(output_ptr, input_ptrs[i], outer_sizes[i] * sizeof(uint8_t)); output_ptr += outer_sizes[i]; input_ptrs[i] += outer_sizes[i]; } else { - const float scale = input_list[i]->scale() / 
output->scale(); + const float scale = inputs[i]->scale() / output->scale(); const float offset = - -input_list[i]->zero_point() * scale + output->zero_point(); + -inputs[i]->zero_point() * scale + output->zero_point(); for (index_t k = 0; k < outer_sizes[i]; ++k) { float out = (*input_ptrs[i]) * scale + offset; *output_ptr = Saturate(roundf(out)); @@ -153,35 +174,49 @@ struct ConcatFunctor : OpKernel { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - - int32_t axis_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLConcatKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &input_list, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel); -}; template -struct ConcatFunctor : OpKernel { - ConcatFunctor(OpKernelContext *context, const int32_t axis); - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); +class ConcatOp : public ConcatOpBase { + public: + explicit ConcatOp(OpConstructContext *context) + : ConcatOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ConcatKernel(axis_)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + Validate(); + Tensor *output = this->Output(0); + return kernel_->Compute(context, inputs_, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterConcat(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CONCAT_H_ diff --git 
a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.cc similarity index 83% rename from mace/kernels/conv_2d.h rename to mace/kernels/conv_2d.cc index ebd23576e205b01988b763f51a2b296d306c1ee7..c6edbff6b88fc89c9c82124b10944b2a4872754b 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CONV_2D_H_ -#define MACE_KERNELS_CONV_2D_H_ - #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #include #endif @@ -22,250 +19,47 @@ #include #include #include +#include #include #include #include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/arm/conv_2d_neon.h" #include "mace/kernels/arm/conv_winograd.h" +#include "mace/kernels/conv_pool_2d_base.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/gemmlowp_util.h" -#include "mace/kernels/quantize.h" #include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/conv_2d.h" +#include "mace/kernels/opencl/buffer/conv_2d.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct Conv2dFunctorBase : OpKernel { - Conv2dFunctorBase(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - dilations_(dilations), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const int *strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const int *dilations_; // [dilation_h, dilation_w] - const ActivationType activation_; - const float relux_max_limit_; -}; - -template -struct Conv2dFunctor; - 
-template<> -struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit), - is_filter_transformed_(false) {} - - void Conv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = stride_hw[0]; - const int stride_w = stride_hw[1]; - const int dilation_h = dilation_hw[0]; - const int dilation_w = dilation_hw[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = 
filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw 
* dilation_w] * filter_ptr3[kw]; - vo3[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + mm * in_channels * filter_size + c * filter_size; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - } - - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w 
- + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - } - filter_ptr0 -= filter_size; - } // w - } // h - } // c - } // mm - } // if - } // m - } // b - } +template +class Conv2dOp; - MaceStatus operator()(const Tensor *input, // NCHW - const Tensor *filter, // OIHW - const Tensor *bias, - Tensor *output, // NCHW - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); +template <> +class Conv2dOp : public ConvPool2dOpBase { + public: + explicit Conv2dOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), + is_filter_transformed_(false) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); std::vector filter_shape(4); filter_shape = filter->shape(); @@ -275,8 +69,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (paddings_.empty()) { CalcNCHWPaddingAndOutputSize(input->shape().data(), filter_shape.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -285,8 +79,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { CalcNCHWOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -340,15 +134,15 @@ struct Conv2dFunctor : Conv2dFunctorBase { std::function conv_func; bool - use_winograd = filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1 - && input_channels >= 8 && channels >= 8; + use_winograd = filter_h == 3 && filter_w == 3 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1 + && input_channels >= 8 && channels >= 8; bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 - && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; + && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; bool use_neon_5x5_s1 = filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; bool use_neon_1x7_s1 = filter_h == 1 && filter_w == 7 @@ -380,7 +174,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (use_winograd) { extra_output_height = 
RoundUp(height, winograd_out_tile_size); extra_input_height = - std::max(padded_input_height, extra_output_height + 2); + std::max(padded_input_height, extra_output_height + 2); extra_output_width = RoundUp(width, winograd_out_tile_size); extra_input_width = std::max(padded_input_width, extra_output_width + 2); if (extra_input_height != padded_input_height) { @@ -394,7 +188,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { index_t tile_width_count = extra_output_width / winograd_out_tile_size; index_t tile_count = tile_height_count * tile_width_count; index_t in_tile_area = - (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2); + (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2); transformed_input_shape.insert(transformed_input_shape.end(), {in_tile_area, batch, input_channels, @@ -455,17 +249,17 @@ struct Conv2dFunctor : Conv2dFunctorBase { total_scratch_size += transformed_input_size + transformed_output_size; } if (extra_input_height != input_height - || extra_input_width != input_width) { + || extra_input_width != input_width) { padded_input_size = - batch * input_channels * (input_height + pad_top + pad_bottom) - * (input_width + pad_left + pad_right) * sizeof(float) + - MACE_EXTRA_BUFFER_PAD_SIZE; + batch * input_channels * (input_height + pad_top + pad_bottom) + * (input_width + pad_left + pad_right) * sizeof(float) + + MACE_EXTRA_BUFFER_PAD_SIZE; total_scratch_size += padded_input_size; } if (extra_output_height != height || extra_output_width != width) { padded_output_size = - batch * channels * extra_output_height * extra_output_width - * sizeof(float); + batch * channels * extra_output_height * extra_output_width + * sizeof(float); total_scratch_size += padded_output_size; } // scratch for sgemm, preoccupy enough buffer @@ -478,13 +272,13 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // Init scratch buffer - ScratchBuffer *scratch = context_->device()->scratch_buffer(); + ScratchBuffer *scratch = context->device()->scratch_buffer(); 
scratch->Rewind(); scratch->GrowSize(total_scratch_size); Tensor - transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); + transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); Tensor - transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT); + transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT); Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); const index_t extra_input_shape[4] = @@ -624,10 +418,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } else if (use_neon_1x15_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK1x15S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); + filter_data, + extra_input_shape, + extra_output_shape, + pad_output); }; } else if (use_neon_15x1_s1) { conv_func = [=](const float *pad_input, float *pad_output) { @@ -644,8 +438,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { extra_input_shape, extra_output_shape, filter_shape.data(), - strides_, - dilations_, + strides_.data(), + dilations_.data(), pad_output); }; } @@ -653,13 +447,9 @@ struct Conv2dFunctor : Conv2dFunctorBase { // pad input and output const Tensor *pad_input_ptr = input; if (extra_input_height != input_height - || extra_input_width != input_width) { - MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(input, - pad_top, - pad_bottom, - pad_left, - pad_right, - &padded_input)); + || extra_input_width != input_width) { + MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding( + input, pad_top, pad_bottom, pad_left, pad_right, &padded_input)); pad_input_ptr = &padded_input; } @@ -667,7 +457,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { Tensor *pad_output_ptr = output; if (extra_output_height != height || extra_output_width != width) { padded_output.Reshape({batch, channels, extra_output_height, - extra_output_width}); + extra_output_width}); 
padded_output.Clear(); pad_output_ptr = &padded_output; } else if (!use_neon_1x1_s1) { @@ -686,13 +476,13 @@ struct Conv2dFunctor : Conv2dFunctorBase { for (index_t c = 0; c < channels; ++c) { for (index_t h = 0; h < height; ++h) { memcpy( - output_data + b * channels * height * width + c * height * width - + h * width, - pad_output_data - + b * channels * extra_output_height * extra_output_width - + c * extra_output_height * extra_output_width - + h * extra_output_width, - sizeof(float) * width); + output_data + b * channels * height * width + c * height * width + + h * width, + pad_output_data + + b * channels * extra_output_height * extra_output_width + + c * extra_output_height * extra_output_width + + h * extra_output_width, + sizeof(float) * width); } } } @@ -727,123 +517,216 @@ struct Conv2dFunctor : Conv2dFunctorBase { DoActivation(output_data, output_data, output->size(), activation_, relux_max_limit_); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - bool is_filter_transformed_; - SGemm sgemm_; -}; - -template<> -struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} - - template - inline void Im2col( - const T *in_data, const std::vector &in_shape, - const index_t filter_h, const index_t filter_w, const index_t stride_h, - const index_t stride_w, const T zero_point, const int pad_height, - const int pad_width, const std::vector &out_shape, - const index_t depth, T* im2col_data) { - const index_t input_row_size = in_shape[2] * in_shape[3]; - const index_t patch_row_size = filter_w * in_shape[3]; - -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) 
{ - for (index_t w = 0; w < out_shape[2]; ++w) { - // Reshape a patch of input to column, which is corresponding to - // a column of output(:, column). - const index_t ih_begin = h * stride_h - (pad_height >> 1); - const index_t ih_end = ih_begin + filter_h; - const index_t iw_begin = w * stride_w - (pad_width >> 1); - const index_t iw_end = iw_begin + filter_w; - // gate height and width to separate padding - const index_t ih_begin_gated = std::max(0, ih_begin); - const index_t ih_end_gated = std::min(ih_end, in_shape[1]); - const index_t iw_begin_gated = std::max(0, iw_begin); - const index_t iw_end_gated = std::min(iw_end, in_shape[2]); - const index_t pad_top = std::max(0, -ih_begin); - const index_t pad_bottom = ih_end - ih_end_gated; - const index_t pad_left = std::max(0, -iw_begin); - const index_t pad_right = iw_end - iw_end_gated; - index_t im2col_column_offset = - ((b * out_shape[1] + h) * out_shape[2] + w) * depth; + private: + void Conv2dGeneral(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; - // fill in padding top - if (pad_top > 0) { - std::fill_n(im2col_data + im2col_column_offset, - pad_top * patch_row_size, zero_point); - } +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; - const 
index_t patch_row_size_gated = - std::min(filter_w - pad_left, - in_shape[2] - iw_begin_gated) * in_shape[3]; - MACE_CHECK(patch_row_size_gated == - ((filter_w - (pad_left + pad_right)) * in_shape[3])); - const index_t pad_left_size = pad_left * in_shape[3]; - const index_t pad_right_size = pad_right * in_shape[3]; - index_t im2col_offset = im2col_column_offset + - (pad_top * filter_w + pad_left) * in_shape[3]; - index_t in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] - + iw_begin_gated) * in_shape[3]; + const int stride_h = stride_hw[0]; + const int stride_w = stride_hw[1]; + const int dilation_h = dilation_hw[0]; + const int dilation_w = dilation_hw[1]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; + } + // calc by row + for 
(index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + // outch 1 + vo1[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr1[kw]; + vo1[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + // outch 2 + vo2[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr2[kw]; + vo2[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + // outch 3 + vo3[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr3[kw]; + vo3[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + } // kw - // fill in effective rows - for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { - // fill in padding left - if (pad_left > 0) { - const index_t left_offset = im2col_offset - pad_left_size; - std::fill_n(im2col_data + left_offset, pad_left_size, zero_point); - } - // copy effective data - std::copy_n(in_data + in_offset, patch_row_size_gated, - im2col_data + im2col_offset); - // fill in padding right - if (pad_right > 0) { - const index_t right_offset = im2col_offset + 
patch_row_size_gated; - std::fill_n(im2col_data + right_offset, pad_right_size, - zero_point); - } - in_offset += input_row_size; - im2col_offset += patch_row_size; - } + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; + } // kh - // fill in padding bottom - if (pad_bottom > 0) { - const index_t pad_bottom_size = pad_bottom * patch_row_size; - const index_t bottom_offset = - im2col_column_offset + depth - pad_bottom_size; - std::fill_n(im2col_data + bottom_offset, pad_bottom_size, - zero_point); - } - } - } - } + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + out_ptr1_base[out_offset + ow] = vo1[ow]; + out_ptr2_base[out_offset + ow] = vo2[ow]; + out_ptr3_base[out_offset + ow] = vo3[ow]; + } + + filter_ptr0 -= filter_size; + filter_ptr1 -= filter_size; + filter_ptr2 -= filter_size; + filter_ptr3 -= filter_size; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter + mm * in_channels * filter_size + c * filter_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (1 outch x 1 height x 4 width): vo_outch_height + float vo0[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + } + + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * 
filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + } + filter_ptr0 -= filter_size; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b } - MaceStatus operator()(const Tensor *input, // NHWC - const Tensor *filter, // OHWI - const Tensor *bias, - Tensor *output, // NHWC - StatsFuture *future) { - MACE_UNUSED(future); + private: + const ActivationType activation_; + const float relux_max_limit_; + bool is_filter_transformed_; + SGemm sgemm_; + + private: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + + +template <> +class Conv2dOp : public ConvPool2dOpBase { + public: + explicit Conv2dOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1, "Quantization convolution does not support dilation > 1 yet."); - auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext(); + auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext(); MACE_CHECK_NOTNULL(gemm_context); std::vector output_shape(4); @@ -853,8 +736,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { NHWC, filter->shape().data(), OHWI, - dilations_, - strides_, + dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -865,8 +748,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { filter->shape().data(), OHWI, paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -916,7 +799,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { bool im2col_required = filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1; total_scratch_size += (im2col_required ? 
im2col_size : 0); - ScratchBuffer *scratch = context_->device()->scratch_buffer(); + ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(total_scratch_size); @@ -965,50 +848,156 @@ struct Conv2dFunctor : Conv2dFunctorBase { gemm_context, filter_matrix, input_matrix, &output_matrix, -filter->zero_point(), -input->zero_point(), output_pipeline); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -}; -#ifdef MACE_ENABLE_OPENCL -class OpenCLConv2dKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel); + private: + template + inline void Im2col( + const T *in_data, const std::vector &in_shape, + const index_t filter_h, const index_t filter_w, const index_t stride_h, + const index_t stride_w, const T zero_point, const int pad_height, + const int pad_width, const std::vector &out_shape, + const index_t depth, T* im2col_data) { + const index_t input_row_size = in_shape[2] * in_shape[3]; + const index_t patch_row_size = filter_w * in_shape[3]; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + // Reshape a patch of input to column, which is corresponding to + // a column of output(:, column). 
+ const index_t ih_begin = h * stride_h - (pad_height >> 1); + const index_t ih_end = ih_begin + filter_h; + const index_t iw_begin = w * stride_w - (pad_width >> 1); + const index_t iw_end = iw_begin + filter_w; + // gate height and width to separate padding + const index_t ih_begin_gated = std::max(0, ih_begin); + const index_t ih_end_gated = std::min(ih_end, in_shape[1]); + const index_t iw_begin_gated = std::max(0, iw_begin); + const index_t iw_end_gated = std::min(iw_end, in_shape[2]); + const index_t pad_top = std::max(0, -ih_begin); + const index_t pad_bottom = ih_end - ih_end_gated; + const index_t pad_left = std::max(0, -iw_begin); + const index_t pad_right = iw_end - iw_end_gated; + index_t im2col_column_offset = + ((b * out_shape[1] + h) * out_shape[2] + w) * depth; + + // fill in padding top + if (pad_top > 0) { + std::fill_n(im2col_data + im2col_column_offset, + pad_top * patch_row_size, zero_point); + } + + const index_t patch_row_size_gated = + std::min(filter_w - pad_left, + in_shape[2] - iw_begin_gated) * in_shape[3]; + MACE_CHECK(patch_row_size_gated == + ((filter_w - (pad_left + pad_right)) * in_shape[3])); + const index_t pad_left_size = pad_left * in_shape[3]; + const index_t pad_right_size = pad_right * in_shape[3]; + index_t im2col_offset = im2col_column_offset + + (pad_top * filter_w + pad_left) * in_shape[3]; + index_t in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] + + iw_begin_gated) * in_shape[3]; + + // fill in effective rows + for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { + // fill in padding left + if (pad_left > 0) { + const index_t left_offset = im2col_offset - pad_left_size; + std::fill_n(im2col_data + left_offset, pad_left_size, zero_point); + } + // copy effective data + std::copy_n(in_data + in_offset, patch_row_size_gated, + im2col_data + im2col_offset); + // fill in padding right + if (pad_right > 0) { + const index_t right_offset = im2col_offset + patch_row_size_gated; + std::fill_n(im2col_data + 
right_offset, pad_right_size, + zero_point); + } + in_offset += input_row_size; + im2col_offset += patch_row_size; + } + + // fill in padding bottom + if (pad_bottom > 0) { + const index_t pad_bottom_size = pad_bottom * patch_row_size; + const index_t bottom_offset = + im2col_column_offset + depth - pad_bottom_size; + std::fill_n(im2col_data + bottom_offset, pad_bottom_size, + zero_point); + } + } + } + } + } + + private: + const ActivationType activation_; + const float relux_max_limit_; + + private: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +#ifdef MACE_ENABLE_OPENCL template -struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +class Conv2dOp : public ConvPool2dOpBase { + public: + explicit Conv2dOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::Conv2dKernel); + } else { + kernel_.reset(new opencl::buffer::Conv2dKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + return kernel_->Compute(context, input, filter, bias, + strides_.data(), padding_type_, paddings_, + dilations_.data(), activation_, relux_max_limit_, + output); + } + private: + const ActivationType activation_; + const float relux_max_limit_; std::unique_ptr kernel_; + + private: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #endif // MACE_ENABLE_OPENCL + +void RegisterConv2D(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CONV_2D_H_ diff --git a/mace/ops/conv_pool_2d_base.h b/mace/kernels/conv_pool_2d_base.h similarity index 59% rename from mace/ops/conv_pool_2d_base.h rename to mace/kernels/conv_pool_2d_base.h index 0a8a8c174617dd0474ec4bdc8e82375c291f5f2a..d1e59c6168375bc8726ef8f6d81e47fe725b52a8 100644 --- a/mace/ops/conv_pool_2d_base.h +++ b/mace/kernels/conv_pool_2d_base.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_CONV_POOL_2D_BASE_H_ -#define MACE_OPS_CONV_POOL_2D_BASE_H_ +#ifndef MACE_KERNELS_CONV_POOL_2D_BASE_H_ +#define MACE_KERNELS_CONV_POOL_2D_BASE_H_ #include @@ -21,18 +21,17 @@ #include "mace/kernels/conv_pool_2d_util.h" namespace mace { -namespace ops { +namespace kernels { -template -class ConvPool2dOpBase : public Operator { +class ConvPool2dOpBase : public Operation { public: - ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - strides_(OperatorBase::GetRepeatedArgs("strides")), - padding_type_(static_cast(OperatorBase::GetOptionalArg( + explicit ConvPool2dOpBase(OpConstructContext *context) + : Operation(context), + strides_(Operation::GetRepeatedArgs("strides")), + padding_type_(static_cast(Operation::GetOptionalArg( "padding", static_cast(SAME)))), - paddings_(OperatorBase::GetRepeatedArgs("padding_values")), - dilations_(OperatorBase::GetRepeatedArgs("dilations", {1, 1})) {} + paddings_(Operation::GetRepeatedArgs("padding_values")), + dilations_(Operation::GetRepeatedArgs("dilations", {1, 1})) {} protected: std::vector strides_; @@ -41,7 +40,7 @@ class ConvPool2dOpBase : public Operator { std::vector dilations_; }; -} // namespace ops +} // namespace kernels } // namespace mace -#endif // MACE_OPS_CONV_POOL_2D_BASE_H_ +#endif // MACE_KERNELS_CONV_POOL_2D_BASE_H_ diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index ce9fb39c77cf592f6b8a34a137af8f195e30364e..c4669f4cf03349967b6897bf36eab9cf0a7b4ee5 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -362,7 +362,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, @@ -408,7 +408,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, } } - return MACE_SUCCESS; + return 
MaceStatus::MACE_SUCCESS; } @@ -460,7 +460,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace kernels diff --git a/mace/kernels/crop.h b/mace/kernels/crop.cc similarity index 63% rename from mace/kernels/crop.h rename to mace/kernels/crop.cc index 0838b69a91b91ae70a7ad16d4a264163c392e3a0..6b1ffa6a12d6e5aa8e5779193bed7b328950e4f6 100644 --- a/mace/kernels/crop.h +++ b/mace/kernels/crop.cc @@ -12,65 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CROP_H_ -#define MACE_KERNELS_CROP_H_ - #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/crop.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct CropFunctor : OpKernel { - CropFunctor(OpKernelContext *context, - const int axis, - const std::vector &offset) - : OpKernel(context), - axis_(axis), - offset_(offset) {} - - void crop_copy(const T* input_data, T* output_data, - const std::vector &input_shape, - const std::vector &output_shape, - const int32_t* offsets) { - const index_t out_img_size = - output_shape[1] * output_shape[2] * output_shape[3]; - const index_t out_hw = output_shape[2] * output_shape[3]; - const index_t in_img_size = - input_shape[1] * input_shape[2] * input_shape[3]; - const index_t in_hw = input_shape[2] * input_shape[3]; -#pragma omp parallel for collapse(3) - for (int b = 0; b < output_shape[0]; ++b) { - for (int c = 0; c < output_shape[1]; ++c) { - for (int h = 0; h < output_shape[2]; ++h) { - T* out_ptr = - output_data + b * out_img_size + c * out_hw + h * output_shape[3]; - const T* in_ptr_bch = - input_data + (b + offsets[0]) * in_img_size + - (c + 
offsets[1]) * in_hw + - (h + offsets[2]) * input_shape[3] + offsets[3]; - memcpy(out_ptr, in_ptr_bch, - output_shape[3] * sizeof(T)); - } - } - } - } - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - MACE_CHECK(input_list.size() == 2, "Crop op needs two inputs."); - const Tensor *input0 = input_list[0]; - const Tensor *input1 = input_list[1]; +template +class CropOp : public Operation { + public: + explicit CropOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 2)), + offset_(Operation::GetRepeatedArgs("offset")) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); + Tensor *output = this->Output(0); + const Tensor *input0 = inputs_[0]; + const Tensor *input1 = inputs_[1]; const uint32_t in0_dims = static_cast(input0->dim_size()); const uint32_t in1_dims = static_cast(input0->dim_size()); @@ -91,8 +56,8 @@ struct CropFunctor : OpKernel { crop_offset = offset_[i - axis_]; } MACE_CHECK(input0->dim(i) - crop_offset >= input1->dim(i)) - << "the crop for dimension" << i << "is out of bound with size" - << input1->dim(i) << "and offset" << crop_offset; + << "the crop for dimension" << i << "is out of bound with size" + << input1->dim(i) << "and offset" << crop_offset; } output_shape[i] = new_size; offsets[i] = crop_offset; @@ -105,37 +70,78 @@ struct CropFunctor : OpKernel { crop_copy(input_data, output_data, input0->shape(), output_shape, offsets.data()); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; + } + + private: + void crop_copy(const T* input_data, T* output_data, + const std::vector &input_shape, + const std::vector &output_shape, + const int32_t* offsets) { + const index_t out_img_size = + output_shape[1] * output_shape[2] * output_shape[3]; + const index_t out_hw = output_shape[2] * output_shape[3]; + const index_t in_img_size = + 
input_shape[1] * input_shape[2] * input_shape[3]; + const index_t in_hw = input_shape[2] * input_shape[3]; +#pragma omp parallel for collapse(3) + for (int b = 0; b < output_shape[0]; ++b) { + for (int c = 0; c < output_shape[1]; ++c) { + for (int h = 0; h < output_shape[2]; ++h) { + T* out_ptr = + output_data + b * out_img_size + c * out_hw + h * output_shape[3]; + const T* in_ptr_bch = + input_data + (b + offsets[0]) * in_img_size + + (c + offsets[1]) * in_hw + + (h + offsets[2]) * input_shape[3] + offsets[3]; + memcpy(out_ptr, in_ptr_bch, + output_shape[3] * sizeof(T)); + } + } + } } + private: const int axis_; std::vector offset_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLCropKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &input_list, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel); -}; template -struct CropFunctor : OpKernel { - CropFunctor(OpKernelContext *context, - const int axis, - const std::vector &offset); - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); +class CropOp : public Operation { + public: + explicit CropOp(OpConstructContext *context) + : Operation(context) { + const int axis = Operation::GetOptionalArg("axis", 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::CropKernel( + axis, Operation::GetRepeatedArgs("offset"))); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + return kernel_->Compute(context, inputs_, this->Output(0)); + } + + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterCrop(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Crop", CropOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Crop", CropOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Crop", CropOp, + DeviceType::GPU, half); 
+#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CROP_H_ diff --git a/mace/kernels/deconv_2d.cc b/mace/kernels/deconv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..44c0c119557449e32406ff820f4f21cc4df72840 --- /dev/null +++ b/mace/kernels/deconv_2d.cc @@ -0,0 +1,561 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/deconv_2d.h" + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/arm/deconv_2d_neon.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/deconv_2d.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +class Deconv2dOpBase : public Operation { + public: + explicit Deconv2dOpBase(OpConstructContext *context) + : Operation(context), + strides_(Operation::GetRepeatedArgs("strides")), + padding_type_(static_cast(Operation::GetOptionalArg( + "padding", static_cast(SAME)))), + paddings_(Operation::GetRepeatedArgs("padding_values")), + model_type_(static_cast( + Operation::GetOptionalArg("framework_type", 0))), + activation_(kernels::StringToActivationType( + 
Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + + + static void CalcDeconvOutputSize( + const index_t *input_shape, // NHWC + const index_t *filter_shape, // OIHW + const int *strides, + index_t *output_shape, + const int *padding_size, + int *input_padding, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(output_shape); + MACE_CHECK_NOTNULL(padding_size); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t output_channel = filter_shape[0]; + + const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + input_padding[0] = static_cast((kernel_h -1) * 2 - padding_size[0]); + input_padding[1] = static_cast((kernel_w -1) * 2 - padding_size[1]); + input_padding[0] = std::max(0, input_padding[0]); + input_padding[1] = std::max(0, input_padding[1]); + + index_t out_height = + (in_height - 1) * strides[0] + kernel_h - padding_size[0]; + index_t out_width = + (in_width - 1) * strides[1] + kernel_w - padding_size[1]; + + output_shape[0] = input_shape[0]; + if (isNCHW) { + output_shape[1] = output_channel; + output_shape[2] = out_height; + output_shape[3] = out_width; + } else { + output_shape[1] = out_height; + output_shape[2] = out_width; + output_shape[3] = output_channel; + } + } + + static void CalcDeconvPaddingAndInputSize( + const index_t *input_shape, // NHWC + const index_t *filter_shape, // OIHW + const int *strides, + Padding padding, + const index_t *output_shape, + int *padding_size, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(output_shape); + MACE_CHECK_NOTNULL(padding_size); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t in_height = isNCHW ? 
input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; + const index_t out_width = isNCHW ? output_shape[3] : output_shape[2]; + + const index_t extended_input_height = (in_height - 1) * strides[0] + 1; + const index_t extended_input_width = (in_width - 1) * strides[1] + 1; + + const index_t filter_h = filter_shape[2]; + const index_t filter_w = filter_shape[3]; + + index_t expected_input_height = 0, expected_input_width = 0; + + switch (padding) { + case VALID: + expected_input_height = + (out_height - filter_h + strides[0]) / strides[0]; + expected_input_width = + (out_width - filter_w + strides[1]) / strides[1]; + break; + case SAME: + expected_input_height = + (out_height + strides[0] - 1) / strides[0]; + expected_input_width = + (out_width + strides[1] - 1) / strides[1]; + break; + default: + MACE_CHECK(false, "Unsupported padding type: ", padding); + } + + MACE_CHECK(expected_input_height == in_height, + expected_input_height, "!=", in_height); + MACE_CHECK(expected_input_width == in_width, + expected_input_width, "!=", in_width); + + const int p_h = static_cast(out_height + + filter_h - 1 - extended_input_height); + const int p_w = static_cast(out_width + + filter_w - 1 - extended_input_width); + + padding_size[0] = std::max(0, p_h); + padding_size[1] = std::max(0, p_w); + } + + protected: + std::vector strides_; // [stride_h, stride_w] + const Padding padding_type_; + std::vector paddings_; + const FrameworkType model_type_; + const ActivationType activation_; + const float relux_max_limit_; +}; + +template +class Deconv2dOp; + +template <> +class Deconv2dOp : public Deconv2dOpBase { + public: + explicit Deconv2dOp(OpConstructContext *context) + : Deconv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *filter = this->Input(1); + const Tensor *bias = nullptr; + 
const Tensor *output_shape_tensor = nullptr; + if (model_type_ == kernels::CAFFE) { + bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; + } else { + output_shape_tensor = + this->InputSize() >= 3 ? this->Input(2) : nullptr; + bias = this->InputSize() >= 4 ? this->Input(3) : nullptr; + } + Tensor *output = this->Output(0); + + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + + std::vector paddings(2); + std::vector out_paddings(2); + std::vector output_shape(4); + if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow + paddings = std::vector(2, 0); + MACE_CHECK_NOTNULL(output_shape_tensor); + MACE_CHECK(output_shape_tensor->size() == 4); + Tensor::MappingGuard output_shape_mapper(output_shape_tensor); + auto output_shape_data = + output_shape_tensor->data(); + output_shape = + std::vector(output_shape_data, output_shape_data + 4); + + const index_t t = output_shape[1]; + output_shape[1] = output_shape[3]; + output_shape[3] = output_shape[2]; + output_shape[2] = t; + + CalcDeconvPaddingAndInputSize( + input->shape().data(), + filter->shape().data(), + strides_.data(), padding_type_, + output_shape.data(), + paddings.data(), true); + } else { // caffe + out_paddings = paddings_; + output_shape = std::vector(4, 0); + CalcDeconvOutputSize(input->shape().data(), + filter->shape().data(), + strides_.data(), + output_shape.data(), + out_paddings.data(), + paddings.data(), + true); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + const index_t *in_shape = input->shape().data(); + + MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ", + output_shape[1]); + MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", + in_shape[1]); + MACE_CHECK(in_shape[0] == output_shape[0], + "Input/Output batch size mismatch"); + std::function deconv_func; + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard 
filter_mapper(filter); + Tensor::MappingGuard bias_mapper(bias); + Tensor::MappingGuard output_mapper(output); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto bias_data = bias == nullptr ? nullptr : bias->data(); + auto output_data = output->mutable_data(); + + const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h; + const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w; + const index_t pad_h = (padded_out_h - output_shape[2]) / 2; + const index_t pad_w = (padded_out_w - output_shape[3]) / 2; + + std::vector padded_out_shape({output_shape[0], output_shape[1], + padded_out_h, padded_out_w}); + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(padded_out_size); + Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); + padded_out.Reshape(padded_out_shape); + padded_out.Clear(); + auto *padded_out_data = padded_out.mutable_data(); + + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 2; + + bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 2; + + if (use_neon_3x3_s1) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK3x3S1(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_3x3_s2) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t 
*padded_out_shape, + float *padded_output) { + Deconv2dNeonK3x3S2(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_4x4_s1) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK4x4S1(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_4x4_s2) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK4x4S2(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dGeneral(input, + filter, + kernel_h, + kernel_w, + strides_.data(), + in_shape, + padded_out_shape, + padded_output); + }; + } + + bool no_pad = + padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; + float *out_data = no_pad ? 
output_data : padded_out_data; + + deconv_func(input_data, + filter_data, + in_shape, + padded_out_shape.data(), + out_data); + if (!no_pad) { + CropPadOut(out_data, + padded_out_shape.data(), + output_shape.data(), + pad_h, + pad_w, + output_data); + } + + if (bias_data != nullptr) { + const index_t batch = output_shape[0]; + const index_t channels = output_shape[1]; + const index_t img_size = output_shape[2] * output_shape[3]; +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + for (index_t i = 0; i < img_size; ++i) { + output_data[(b * channels + c) * img_size + i] += + bias_data[c]; + } + } + } + } + + DoActivation(output_data, + output_data, + output->size(), + activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } + + private: + void Deconv2dGeneral(const float *input, + const float *filter, + const index_t kernel_h, + const index_t kernel_w, + const int *strides, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + + const int kernel_size = static_cast(kernel_h * kernel_w); + std::vector index_map(kernel_size, 0); + for (index_t i = 0; i < kernel_h; ++i) { + for (index_t j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + +#pragma omp parallel for collapse(2) + for (int b = 0; b < batch; ++b) { + for (int oc = 0; oc < out_channels; ++oc) { + float *out_base = + output + (b * out_channels + oc) * out_img_size; + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + 
i * strides[0] * out_width + j * strides[1]; + for (int ic = 0; ic < in_channels; ++ic) { + const index_t input_idx = + (b * in_channels + ic) * in_img_size + i * in_width + j; + const float val = input[input_idx]; + const index_t kernel_offset = + (oc * in_channels + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter[kernel_idx]; + } + } + } + } + } + } + } + + void CropPadOut(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const index_t pad_h, + const index_t pad_w, + float *output) { + const index_t batch = in_shape[0]; + const index_t channel = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; +#pragma omp parallel for collapse(3) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channel; ++j) { + for (int k = 0; k < out_height; ++k) { + const float *input_base = + input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; + float *output_base = + output + ((i * channel + j) * out_height + k)* out_width; + memcpy(output_base, input_base + pad_w, out_width * sizeof(float)); + } + } + } + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class Deconv2dOp : public Deconv2dOpBase { + public: + explicit Deconv2dOp(OpConstructContext *context) + : Deconv2dOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::Deconv2dKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *filter = this->Input(1); + const Tensor *bias = nullptr; + const Tensor *output_shape_tensor = nullptr; + if (model_type_ == kernels::CAFFE) { + bias = this->InputSize() >= 3 ? 
this->Input(2) : nullptr; + } else { + output_shape_tensor = + this->InputSize() >= 3 ? this->Input(2) : nullptr; + bias = this->InputSize() >= 4 ? this->Input(3) : nullptr; + } + Tensor *output = this->Output(0); + + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + std::vector paddings(2); + std::vector out_paddings(2); + std::vector output_shape(4); + if (model_type_ == FrameworkType::TENSORFLOW) { + paddings = std::vector(2, 0); + MACE_CHECK_NOTNULL(output_shape_tensor); + MACE_CHECK(output_shape_tensor->size() == 4); + Tensor::MappingGuard output_shape_mapper(output_shape_tensor); + auto output_shape_data = + output_shape_tensor->data(); + output_shape = + std::vector(output_shape_data, output_shape_data + 4); + CalcDeconvPaddingAndInputSize(input->shape().data(), + filter->shape().data(), + strides_.data(), + padding_type_, + output_shape.data(), + paddings.data()); + } else { + out_paddings = paddings_; + paddings = std::vector(2, 0); + output_shape = std::vector(4, 0); + CalcDeconvOutputSize(input->shape().data(), + filter->shape().data(), + strides_.data(), + output_shape.data(), + out_paddings.data(), + paddings.data()); + } + + return kernel_->Compute(context, input, filter, bias, + strides_.data(), paddings.data(), activation_, + relux_max_limit_, output_shape, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterDeconv2D(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 7e1ed460272658f871b2695254183a716868399c..25413d9884a02207e69d320e284cbb17852bb704 100644 
--- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -15,22 +15,6 @@ #ifndef MACE_KERNELS_DECONV_2D_H_ #define MACE_KERNELS_DECONV_2D_H_ -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/arm/deconv_2d_neon.h" -#include "mace/kernels/conv_pool_2d_util.h" -#include "mace/utils/utils.h" - namespace mace { namespace kernels { @@ -39,483 +23,6 @@ enum FrameworkType { CAFFE = 1, }; -struct Deconv2dFunctorBase : OpKernel { - Deconv2dFunctorBase(OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - model_type_(model_type), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - static void CalcDeconvOutputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - index_t *output_shape, - const int *padding_size, - int *input_padding, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(padding_size); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t output_channel = filter_shape[0]; - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? 
input_shape[3] : input_shape[2]; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - input_padding[0] = static_cast((kernel_h -1) * 2 - padding_size[0]); - input_padding[1] = static_cast((kernel_w -1) * 2 - padding_size[1]); - input_padding[0] = std::max(0, input_padding[0]); - input_padding[1] = std::max(0, input_padding[1]); - - index_t out_height = - (in_height - 1) * strides[0] + kernel_h - padding_size[0]; - index_t out_width = - (in_width - 1) * strides[1] + kernel_w - padding_size[1]; - - output_shape[0] = input_shape[0]; - if (isNCHW) { - output_shape[1] = output_channel; - output_shape[2] = out_height; - output_shape[3] = out_width; - } else { - output_shape[1] = out_height; - output_shape[2] = out_width; - output_shape[3] = output_channel; - } - } - - static void CalcDeconvPaddingAndInputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - Padding padding, - const index_t *output_shape, - int *padding_size, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(padding_size); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; - const index_t out_width = isNCHW ? 
output_shape[3] : output_shape[2]; - - const index_t extended_input_height = (in_height - 1) * strides[0] + 1; - const index_t extended_input_width = (in_width - 1) * strides[1] + 1; - - const index_t filter_h = filter_shape[2]; - const index_t filter_w = filter_shape[3]; - - index_t expected_input_height = 0, expected_input_width = 0; - - switch (padding) { - case VALID: - expected_input_height = - (out_height - filter_h + strides[0]) / strides[0]; - expected_input_width = - (out_width - filter_w + strides[1]) / strides[1]; - break; - case SAME: - expected_input_height = - (out_height + strides[0] - 1) / strides[0]; - expected_input_width = - (out_width + strides[1] - 1) / strides[1]; - break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); - } - - MACE_CHECK(expected_input_height == in_height, - expected_input_height, "!=", in_height); - MACE_CHECK(expected_input_width == in_width, - expected_input_width, "!=", in_width); - - const int p_h = static_cast(out_height + - filter_h - 1 - extended_input_height); - const int p_w = static_cast(out_width + - filter_w - 1 - extended_input_width); - - padding_size[0] = std::max(0, p_h); - padding_size[1] = std::max(0, p_w); - } - - std::vector strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const FrameworkType model_type_; - const ActivationType activation_; - const float relux_max_limit_; -}; - - -template -struct Deconv2dFunctor; - -template<> -struct Deconv2dFunctor: Deconv2dFunctorBase { - Deconv2dFunctor(OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit) - : Deconv2dFunctorBase(context, - strides, - padding_type, - paddings, - model_type, - activation, - relux_max_limit) {} - - void Deconv2dGeneral(const float *input, - const float *filter, - const index_t kernel_h, - const index_t 
kernel_w, - const int *strides, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = static_cast(kernel_h * kernel_w); - std::vector index_map(kernel_size, 0); - for (index_t i = 0; i < kernel_h; ++i) { - for (index_t j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - - const index_t batch = in_shape[0]; - const index_t out_channels = out_shape[1]; - const index_t in_channels = in_shape[1]; - -#pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int oc = 0; oc < out_channels; ++oc) { - float *out_base = - output + (b * out_channels + oc) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - for (int ic = 0; ic < in_channels; ++ic) { - const index_t input_idx = - (b * in_channels + ic) * in_img_size + i * in_width + j; - const float val = input[input_idx]; - const index_t kernel_offset = - (oc * in_channels + ic) * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_offset + index_map[k]; - const index_t kernel_idx = kernel_offset + k; - out_base[out_idx] += val * filter[kernel_idx]; - } - } - } - } - } - } - } - - void CropPadOut(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const index_t pad_h, - const index_t pad_w, - float *output) { - const index_t batch = in_shape[0]; - const index_t channel = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; -#pragma omp parallel for 
collapse(3) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < out_height; ++k) { - const float *input_base = - input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; - float *output_base = - output + ((i * channel + j) * out_height + k)* out_width; - memcpy(output_base, input_base + pad_w, out_width * sizeof(float)); - } - } - } - } - - MaceStatus operator()(const Tensor *input, // NCHW - const Tensor *filter, // OIHW - const Tensor *bias, - const Tensor *output_shape_tensor, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); - - std::vector paddings(2); - std::vector out_paddings(2); - std::vector output_shape(4); - if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow - paddings = std::vector(2, 0); - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); - output_shape = - std::vector(output_shape_data, output_shape_data + 4); - - const index_t t = output_shape[1]; - output_shape[1] = output_shape[3]; - output_shape[3] = output_shape[2]; - output_shape[2] = t; - - CalcDeconvPaddingAndInputSize( - input->shape().data(), - filter->shape().data(), - strides_.data(), padding_type_, - output_shape.data(), - paddings.data(), true); - } else { // caffe - out_paddings = paddings_; - output_shape = std::vector(4, 0); - CalcDeconvOutputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - output_shape.data(), - out_paddings.data(), - paddings.data(), - true); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - const index_t *in_shape = input->shape().data(); - - MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ", - output_shape[1]); 
- MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", - in_shape[1]); - MACE_CHECK(in_shape[0] == output_shape[0], - "Input/Output batch size mismatch"); - std::function deconv_func; - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? nullptr : bias->data(); - auto output_data = output->mutable_data(); - - const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h; - const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w; - const index_t pad_h = (padded_out_h - output_shape[2]) / 2; - const index_t pad_w = (padded_out_w - output_shape[3]) / 2; - - std::vector padded_out_shape({output_shape[0], output_shape[1], - padded_out_h, padded_out_w}); - index_t padded_out_size = - std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) * sizeof(float); - ScratchBuffer *scratch = context_->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(padded_out_size); - Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); - padded_out.Reshape(padded_out_shape); - padded_out.Clear(); - auto *padded_out_data = padded_out.mutable_data(); - - bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 2; - - bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 2; - - if (use_neon_3x3_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - 
float *padded_output) { - Deconv2dNeonK3x3S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else if (use_neon_3x3_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dNeonK3x3S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else if (use_neon_4x4_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dNeonK4x4S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else if (use_neon_4x4_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dNeonK4x4S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - in_shape, - padded_out_shape, - padded_output); - }; - } - - bool no_pad = - padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; - float *out_data = no_pad ? 
output_data : padded_out_data; - - deconv_func(input_data, - filter_data, - in_shape, - padded_out_shape.data(), - out_data); - if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - output_shape.data(), - pad_h, - pad_w, - output_data); - } - - if (bias_data != nullptr) { - const index_t batch = output_shape[0]; - const index_t channels = output_shape[1]; - const index_t img_size = output_shape[2] * output_shape[3]; -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < img_size; ++i) { - output_data[(b * channels + c) * img_size + i] += - bias_data[c]; - } - } - } - } - - DoActivation(output_data, - output_data, - output->size(), - activation_, - relux_max_limit_); - - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLDeconv2dKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *padding_data, - const ActivationType activation, - const float relux_max_limit, - const std::vector &output_shape, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel); -}; -template -struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const Tensor *output_shape_tensor, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - } // namespace kernels } // namespace mace diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.cc similarity index 62% rename from mace/kernels/depth_to_space.h rename to 
mace/kernels/depth_to_space.cc index e73dec7660e4f61bafc5356e499e1076d5f1ef79..cd10b2b018ee1a52c430e3d5748b107d46c40dd5 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.cc @@ -12,32 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_DEPTH_TO_SPACE_H_ -#define MACE_KERNELS_DEPTH_TO_SPACE_H_ #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" - +#include "mace/core/operator.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/kernels/opencl/image/depth_to_space.h" #endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct DepthToSpaceOpFunctor : OpKernel { - DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size) - : OpKernel(context), block_size_(block_size) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class DepthToSpaceOp : public Operation { + public: + explicit DepthToSpaceOp(OpConstructContext *context) + : Operation(context), + block_size_(Operation::GetOptionalArg("block_size", 1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); const index_t batch_size = input->dim(0); const index_t input_depth = input->dim(1); const index_t input_height = input->dim(2); @@ -85,36 +82,50 @@ struct DepthToSpaceOpFunctor : OpKernel { } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: const int block_size_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLDepthToSpaceKernel { +template +class DepthToSpaceOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, 
- StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel); -}; -template -struct DepthToSpaceOpFunctor : OpKernel { - DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + explicit DepthToSpaceOp(OpConstructContext *context) + : Operation(context) { + int block_size = Operation::GetOptionalArg("block_size", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL +void RegisterDepthToSpace(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_DEPTH_TO_SPACE_H_ diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.cc similarity index 74% rename from mace/kernels/depthwise_conv2d.h rename to mace/kernels/depthwise_conv2d.cc index a7765b3022f0a57ee3e0826d8c8037761c62150a..74def6cf635375123f5d497f14ca739f20a2f878 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.cc @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_ -#define MACE_KERNELS_DEPTHWISE_CONV2D_H_ - #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #include #endif #include #include +#include #include // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it @@ -27,120 +25,51 @@ #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" #include "mace/core/future.h" -#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/core/operator.h" #include "mace/kernels/activation.h" #include "mace/kernels/arm/depthwise_conv2d_neon.h" -#include "mace/kernels/quantize.h" +#include "mace/kernels/conv_pool_2d_base.h" #include "mace/public/mace.h" - +#include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/kernels/opencl/image/depthwise_conv2d.h" +#include "mace/kernels/opencl/buffer/depthwise_conv2d.h" #endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct DepthwiseConv2dFunctorBase : OpKernel { - DepthwiseConv2dFunctorBase(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - dilations_(dilations), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const int *strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const int *dilations_; // [dilation_h, dilation_w] +class DepthwiseConv2dOpBase : public ConvPool2dOpBase { + public: + explicit DepthwiseConv2dOpBase(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + protected: const ActivationType activation_; const float 
relux_max_limit_; }; -template -struct DepthwiseConv2dFunctor; - -template<> -struct DepthwiseConv2dFunctor - : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} - - void DepthwiseConv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t multiplier = filter_shape[0] / filter_shape[1]; -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < filter_shape[0]; ++m) { - for (index_t h = 0; h < out_shape[2]; ++h) { - for (index_t w = 0; w < out_shape[3]; ++w) { - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - const index_t filter_height = filter_shape[2]; - const index_t filter_width = filter_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - index_t out_offset = - ((b * out_channels + m) * out_height + h) * out_width + w; - index_t c = m / multiplier; - index_t o = m % multiplier; - float sum = 0; - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { - index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; - index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; - if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - index_t in_offset = - ((b * in_channels + c) * in_height + ih) * in_width + iw; - index_t filter_offset = - (((o * in_channels) + c) * 
filter_height + kh) - * filter_width + kw; +template +class DepthwiseConv2dOp; - sum += input[in_offset] * filter[filter_offset]; - } - } - } - output[out_offset] = sum; - } - } - } +template <> +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { + public: + explicit DepthwiseConv2dOp(OpConstructContext *context) + : DepthwiseConv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = nullptr; + if (this->InputSize() >= 3) { + bias = this->Input(BIAS); } - } - - MaceStatus operator()(const Tensor *input, // NCHW - const Tensor *filter, // OIHW - const Tensor *bias, - Tensor *output, // NCHW - StatsFuture *future) { - MACE_UNUSED(future); + Tensor *output = this->Output(OUTPUT); MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); @@ -148,14 +77,14 @@ struct DepthwiseConv2dFunctor std::vector output_shape(4); std::vector paddings(2); std::vector filter_shape - {filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2), - filter->dim(3)}; + {filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2), + filter->dim(3)}; if (paddings_.empty()) { CalcNCHWPaddingAndOutputSize(input->shape().data(), filter_shape.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -164,8 +93,8 @@ struct DepthwiseConv2dFunctor CalcNCHWOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -230,7 +159,7 @@ struct DepthwiseConv2dFunctor MACE_UNUSED(input_shape); if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { + && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { 
DepthwiseConv2dNeonK3x3S1(input, filter_data, @@ -244,7 +173,7 @@ struct DepthwiseConv2dFunctor output); }; } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 - && dilation_h == 1 && dilation_w == 1) { + && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S2(input, filter_data, @@ -264,8 +193,8 @@ struct DepthwiseConv2dFunctor input_shape, output_shape.data(), filter_shape.data(), - strides_, - dilations_, + strides_.data(), + dilations_.data(), pad_hw, output); }; @@ -279,7 +208,7 @@ struct DepthwiseConv2dFunctor for (index_t c = 0; c < channels; ++c) { for (index_t i = 0; i < height * width; ++i) { output_data[(b * channels + c) * height * width + i] += - bias_data[c]; + bias_data[c]; } } } @@ -288,115 +217,81 @@ struct DepthwiseConv2dFunctor DoActivation(output_data, output_data, output->size(), activation_, relux_max_limit_); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -}; - -template<> -struct DepthwiseConv2dFunctor - : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} - void DepthwiseConv2dGeneral(const uint8_t *input, - const uint8_t *filter, - const int32_t *bias, + private: + void DepthwiseConv2dGeneral(const float *input, + const float *filter, const index_t *in_shape, const index_t *out_shape, const index_t *filter_shape, - const int32_t input_zero, - const int32_t filter_zero, - const int32_t output_zero, - const float output_multiplier, const int *stride_hw, const int *dilation_hw, const int *pad_hw, - uint8_t *output) { + float *output) { + const index_t multiplier = filter_shape[0] / filter_shape[1]; #pragma omp 
parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - for (index_t m = 0; m < out_shape[3]; ++m) { - const index_t filter_height = filter_shape[0]; - const index_t filter_width = filter_shape[1]; - const index_t in_channels = filter_shape[2]; - const index_t depth_multiplier = filter_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t out_channels = out_shape[3]; + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < filter_shape[0]; ++m) { + for (index_t h = 0; h < out_shape[2]; ++h) { + for (index_t w = 0; w < out_shape[3]; ++w) { + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; index_t out_offset = - ((b * out_height + h) * out_width + w) * out_channels + m; - index_t c = m / depth_multiplier; - index_t o = m % depth_multiplier; - index_t ih_base = h * stride_hw[0] - pad_hw[0]; - index_t iw_base = w * stride_hw[1] - pad_hw[1]; - int32_t sum = 0; + ((b * out_channels + m) * out_height + h) * out_width + w; + index_t c = m / multiplier; + index_t o = m % multiplier; + float sum = 0; for (index_t kh = 0; kh < filter_height; ++kh) { - const index_t ih = ih_base + kh * dilation_hw[0]; for (index_t kw = 0; kw < filter_width; ++kw) { - const index_t iw = iw_base + kw * dilation_hw[1]; + index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; + index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { index_t in_offset = - 
((b * in_height + ih) * in_width + iw) * in_channels + c; + ((b * in_channels + c) * in_height + ih) * in_width + iw; index_t filter_offset = - ((kh * filter_width + kw) * in_channels + c) - * depth_multiplier + o; + (((o * in_channels) + c) * filter_height + kh) + * filter_width + kw; - sum += (input[in_offset] - input_zero) * - (filter[filter_offset] - filter_zero); + sum += input[in_offset] * filter[filter_offset]; } } } - if (bias) { - sum += bias[m]; - } - sum = static_cast(std::round(sum * output_multiplier)); - sum += output_zero; - output[out_offset] = - static_cast(std::min(255, std::max(0, sum))); + output[out_offset] = sum; } } } } } - inline tflite::Dims<4> ShapeToTfliteDims(const std::vector &shape) { - tflite::Dims<4> d; - for (int i = 0; i < 4; ++i) { - int src = static_cast(shape.size() - i - 1); - if (src >= 0) { - d.sizes[i] = shape[src]; - } else { - d.sizes[i] = 1; - } - } - d.strides[0] = 1; - for (int i = 1; i < 4; i++) { - d.strides[i] = d.strides[i - 1] * d.sizes[i - 1]; - } - return d; - } + protected: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; - MaceStatus operator()(const Tensor *input, // NHWC - const Tensor *filter, // HWIM - const Tensor *bias, - Tensor *output, // NHWC - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class DepthwiseConv2dOp + : public DepthwiseConv2dOpBase { + public: + explicit DepthwiseConv2dOp(OpConstructContext *context) + : DepthwiseConv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = nullptr; + if (this->InputSize() >= 3) { + bias = this->Input(BIAS); + } + Tensor *output = this->Output(OUTPUT); MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); @@ -412,8 +307,8 @@ struct DepthwiseConv2dFunctor NHWC, ohwi_shape.data(), OHWI, - dilations_, - strides_, + dilations_.data(), + 
strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -424,8 +319,8 @@ struct DepthwiseConv2dFunctor ohwi_shape.data(), OHWI, paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -493,54 +388,149 @@ struct DepthwiseConv2dFunctor input_data, filter_data, bias_data, input->shape().data(), output_shape.data(), filter->shape().data(), input->zero_point(), filter->zero_point(), output->zero_point(), output_multiplier, - strides_, dilations_, pad_hw, output_data); + strides_.data(), dilations_.data(), pad_hw, output_data); } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: + void DepthwiseConv2dGeneral(const uint8_t *input, + const uint8_t *filter, + const int32_t *bias, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int32_t input_zero, + const int32_t filter_zero, + const int32_t output_zero, + const float output_multiplier, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + uint8_t *output) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + for (index_t m = 0; m < out_shape[3]; ++m) { + const index_t filter_height = filter_shape[0]; + const index_t filter_width = filter_shape[1]; + const index_t in_channels = filter_shape[2]; + const index_t depth_multiplier = filter_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t out_channels = out_shape[3]; + index_t out_offset = + ((b * out_height + h) * out_width + w) * out_channels + m; + index_t c = m / depth_multiplier; + index_t o = m % depth_multiplier; + index_t ih_base = h * stride_hw[0] - pad_hw[0]; + index_t iw_base = w * stride_hw[1] - pad_hw[1]; + int32_t sum = 0; + 
for (index_t kh = 0; kh < filter_height; ++kh) { + const index_t ih = ih_base + kh * dilation_hw[0]; + for (index_t kw = 0; kw < filter_width; ++kw) { + const index_t iw = iw_base + kw * dilation_hw[1]; + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + index_t in_offset = + ((b * in_height + ih) * in_width + iw) * in_channels + c; + index_t filter_offset = + ((kh * filter_width + kw) * in_channels + c) + * depth_multiplier + o; + + sum += (input[in_offset] - input_zero) * + (filter[filter_offset] - filter_zero); + } + } + } + if (bias) { + sum += bias[m]; + } + sum = static_cast(std::round(sum * output_multiplier)); + sum += output_zero; + output[out_offset] = + static_cast(std::min(255, std::max(0, sum))); + } + } + } + } + } + + inline tflite::Dims<4> ShapeToTfliteDims(const std::vector &shape) { + tflite::Dims<4> d; + for (int i = 0; i < 4; ++i) { + int src = static_cast(shape.size() - i - 1); + if (src >= 0) { + d.sizes[i] = shape[src]; + } else { + d.sizes[i] = 1; + } + } + d.strides[0] = 1; + for (int i = 1; i < 4; i++) { + d.strides[i] = d.strides[i - 1] * d.sizes[i - 1]; + } + return d; + } + + protected: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #ifdef MACE_ENABLE_OPENCL -class OpenCLDepthwiseConv2dKernel { +template +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel); -}; - -template -struct DepthwiseConv2dFunctor - : DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const 
int *dilations, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); + explicit DepthwiseConv2dOp(OpConstructContext *context) + : DepthwiseConv2dOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthwiseConv2dKernel); + } else { + kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = nullptr; + if (this->InputSize() >= 3) { + bias = this->Input(BIAS); + } + Tensor *output = this->Output(OUTPUT); + return kernel_->Compute(context, input, filter, bias, + strides_.data(), padding_type_, paddings_, + dilations_.data(), activation_, relux_max_limit_, + output); + } + private: std::unique_ptr kernel_; + + protected: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #endif // MACE_ENABLE_OPENCL + +void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/eltwise.cc b/mace/kernels/eltwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..e33006ea43d83cdb4307617f730454dc502b6828 --- /dev/null +++ b/mace/kernels/eltwise.cc @@ -0,0 +1,1125 @@ +// Copyright 2018 Xiaomi, Inc. 
All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/eltwise.h" + +#include +#include +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/utils/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/eltwise.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + + +inline index_t GetIndex(const std::vector &shape, + const std::vector &index) { + index_t idx = 0; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] > 1) { + idx = idx * shape[i] + index[i]; + } + } + return idx; +} + +inline void IncreaseIndex(const std::vector &shape, + std::vector *index) { + for (index_t i = static_cast(shape.size()) - 1; i >= 0; --i) { + ++(*index)[i]; + if ((*index)[i] >= shape[i]) { + (*index)[i] -= shape[i]; + } else { + break; + } + } +} + +template +inline void TensorGeneralBroadcastEltwise( + const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const bool swapped, + const std::vector &input0_shape, + const std::vector &input1_shape, + const std::vector &output_shape, + DstType *output) { + const index_t output_size = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + std::vector out_index(output_shape.size(), 0); + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t i = 0; i < output_size; 
++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] + input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = + input0[idx0] * coeff_copy[0] + input1[idx1] * coeff_copy[1]; + IncreaseIndex(output_shape, &out_index); + } + } + break; + case SUB: + if (!swapped) { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] - input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + } else { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input1[idx1] - input0[idx0]; + IncreaseIndex(output_shape, &out_index); + } + } + break; + case PROD: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] * input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + break; + case DIV: + if (!swapped) { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] / input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + } else { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input1[idx1] / input0[idx0]; + IncreaseIndex(output_shape, &out_index); + } + 
} + break; + case MIN: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::min(input1[idx1], input0[idx0]); + IncreaseIndex(output_shape, &out_index); + } + break; + case MAX: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::max(input1[idx1], input0[idx0]); + IncreaseIndex(output_shape, &out_index); + } + break; + case SQR_DIFF: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::pow(input1[idx1] - input0[idx0], 2.f); + IncreaseIndex(output_shape, &out_index); + } + break; + case POW: + if (!swapped) { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::pow(input0[idx0], input1[idx1]); + IncreaseIndex(output_shape, &out_index); + } + } else { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::pow(input1[idx1], input0[idx0]); + IncreaseIndex(output_shape, &out_index); + } + } + break; + case EQUAL: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input1[idx1] == input0[idx0]; + IncreaseIndex(output_shape, &out_index); + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +template +inline void TensorBroadcastEltwise(const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const index_t diff_size, + const index_t common_size, + const 
bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] + input1[i]; + } + } + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] * coeff_copy[0] + + input1[i] * coeff_copy[1]; + } + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] - input1[i]; + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input1[i] - input0[i + d * common_size]; + } + } + } + break; + case PROD: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = input0[i + d * common_size] * input1[i]; + } + } + break; + case DIV: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] / input1[i]; + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input1[i] / input0[i + d * common_size]; + } + } + } + break; + case MIN: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i 
+ d * common_size] = + std::min(input0[i + d * common_size], input1[i]); + } + } + break; + case MAX: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::max(input0[i + d * common_size], input1[i]); + } + } + break; + case SQR_DIFF: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::pow(input0[i + d * common_size] - input1[i], 2.f); + } + } + break; + case POW: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::pow(input0[i + d * common_size], input1[i]); + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::pow(input1[i], input0[i + d * common_size]); + } + } + } + break; + case NEG: +#pragma omp parallel for + for (index_t i = 0; i < diff_size * common_size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < diff_size * common_size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] == input1[i]; + } + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +// Multiplication is costly, so we specialize the following case. 
+template +inline void TensorEltwise(const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const index_t size, + const bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] + input1[i]; + } + + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] - input1[i]; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1[i] - input0[i]; + } + } + break; + case PROD: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * input1[i]; + } + + break; + case DIV: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] / input1[i]; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1[i] / input0[i]; + } + } + break; + case MIN: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::min(input0[i], input1[i]); + } + + break; + case MAX: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input0[i], input1[i]); + } + + break; + case SQR_DIFF: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i] - input1[i], 2.f); + } + + break; + case POW: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i], input1[i]); + } + } else { + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input1[i], input0[i]); + } + } + break; + case NEG: +#pragma omp parallel for 
+ for (index_t i = 0; i < size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] == input1[i]; + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +// Multiplication is costly, so we specialize the following case. +template +inline void TensorScalarEltwise(const EltwiseType type, + const T *input0, + const T input1, + const std::vector &coeff, + const index_t size, + const bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] + input1; + } + + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] - input1; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1 - input0[i]; + } + } + break; + case PROD: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * input1; + } + + break; + case DIV: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] / input1; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1 / input0[i]; + } + } + break; + case MIN: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::min(input0[i], input1); + } + + break; + case MAX: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input0[i], input1); + } 
+ + break; + case SQR_DIFF: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i] - input1, 2.f); + } + + break; + case POW: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i], input1); + } + } else { + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input1, input0[i]); + } + } + break; + case NEG: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] == input1; + } + + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +template +inline void TensorEltwisePerChannel(const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const index_t batch0, + const index_t batch1, + const index_t channel, + const index_t image_size, + const bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] + in1_ptr[c]; + } + } + } + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = + in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; + } + } + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] - in1_ptr[c]; + } + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] - in0_ptr[i]; + } + } + } + } + break; + case PROD: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] * in1_ptr[c]; + } + } + } + break; + case DIV: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] / in1_ptr[c]; + } + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] / in0_ptr[i]; + } + } + } + } + break; + case MIN: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); + } + } + } + break; + case MAX: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); + } + } + } + break; + case SQR_DIFF: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); + } + } + } + break; + case POW: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); + } + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); + } + } + } + } + break; + case NEG: +#pragma omp parallel for + for (index_t i = 0; i < batch0 * channel * image_size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < batch0 * channel * image_size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] == in1_ptr[c]; + } + } + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +template +class EltwiseOp : public Operation { + public: + explicit EltwiseOp(OpConstructContext *context) + : Operation(context), + type_(static_cast(Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE)))), + coeff_(Operation::GetRepeatedArgs("coeff")), + scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), + scalar_input_index_(Operation::GetOptionalArg( + "scalar_input_index", 1)), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", 0))) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; + Tensor *output = this->Output(0); + if (input1 == nullptr) { + scalar_tensor_.Resize({}); + Tensor::MappingGuard guard(&scalar_tensor_); + auto scalar_data = scalar_tensor_.mutable_data(); + scalar_data[0] = static_cast(scalar_input_); + input1 = &scalar_tensor_; + } + + if (IsLogicalType(type_)) { + // as we do not have bool-type tensor, we use int type + return DoEltwise(input0, input1, output); + } else { + return DoEltwise(input0, input1, output); + } + } + + private: + template + MaceStatus DoEltwise(const Tensor *input0, + const Tensor *input1, + Tensor *output) { + bool swapped = false; + if (input0->size() < input1->size()) { + std::swap(input0, input1); + swapped = true; + } + if (scalar_input_index_ == 0) { + swapped = !swapped; + } + + // check if we can broadcast tensor + uint32_t rank_diff = + static_cast(input0->dim_size() - input1->dim_size()); + if (data_format_ == NCHW) { + MACE_CHECK( + (input0->dim_size() == 4) && + ((input1->dim_size() == 0) || + (input1->dim_size() == 4 && + input1->dim(1) == 
input0->dim(1) && + (input1->dim(0) == input0->dim(0) || + input1->dim(0) == 1)) || + (input1->dim_size() == 1 && + input1->dim(0) == input0->dim(1))), + "only support broadcast channel dimension"); + } else { + for (uint32_t i = 0; i < input1->dim_size(); ++i) { + MACE_CHECK(input0->dim(rank_diff + i) == 1 || input1->dim(i) == 1 || + input0->dim(rank_diff + i) == input1->dim(i), + "Element-Wise op only support tail dimensions broadcast"); + } + } + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + + const T *input0_ptr = input0->data(); + const T *input1_ptr = input1->data(); + + if (data_format_ == NCHW && input1->dim_size() > 0 && + input1->size() < input0->size()) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); + Tensor::MappingGuard output_guard(output); + DstType *output_ptr = output->mutable_data(); + TensorEltwisePerChannel( + type_, input0_ptr, input1_ptr, coeff_, input0->dim(0), + input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1), + input0->dim(2) * input0->dim(3), swapped, output_ptr); + + } else { + const std::vector &input0_shape = input0->shape(); + std::vector input1_shape(rank_diff, 1); + input1_shape.insert(input1_shape.end(), input1->shape().begin(), + input1->shape().end()); + + std::vector output_shape(input0->dim_size(), 0); + for (unsigned int i = 0; i < input0_shape.size(); ++i) { + output_shape[i] = std::max(input0_shape[i], input1_shape[i]); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + Tensor::MappingGuard output_guard(output); + DstType *output_ptr = output->mutable_data(); + + bool need_general_broadcast = false; + for (uint32_t i = 0; i < input1->dim_size(); ++i) { + if ((input0->dim(rank_diff + i) == 1 && input1->dim(i) > 1) || + (input0->dim(rank_diff + i) > 1 && input1->dim(i) == 1)) { + need_general_broadcast = true; + break; + } + } + + if (need_general_broadcast) { + TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + swapped, input0_shape, 
input1_shape, + output_shape, output_ptr); + } else if (input1->size() == input0->size()) { + TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), + swapped, output_ptr); + } else if (input1->size() < input0->size()) { + if (input1->size() > 1) { + index_t common_size = input1->size(); + index_t diff_size = input0->size() / common_size; + TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + diff_size, common_size, swapped, output_ptr); + } else { + TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_, + input0->size(), swapped, output_ptr); + } + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + DataFormat data_format_; + Tensor scalar_tensor_; +}; + +template <> +class EltwiseOp : public Operation { + public: + explicit EltwiseOp(OpConstructContext *context) + : Operation(context), + type_(static_cast(Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE)))), + coeff_(Operation::GetRepeatedArgs("coeff")), + scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), + scalar_input_index_(Operation::GetOptionalArg( + "scalar_input_index", 1)), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", 0))) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; + Tensor *output = this->Output(0); + MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. 
"); + MACE_CHECK(input0->size() == input1->size(), + "input0 and input1 must have the same shape."); + MACE_CHECK(output->scale() != 0); + MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); + + constexpr int left_shift = 20; + const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); + const double adjusted_input0_scale = input0->scale() / doubled_scale; + const double adjusted_input1_scale = input1->scale() / doubled_scale; + const double adjusted_output_scale = + doubled_scale / ((1 << left_shift) * output->scale()); + + int32_t input0_multiplier; + int32_t input1_multiplier; + int32_t output_multiplier; + int32_t input0_shift; + int32_t input1_shift; + int32_t output_shift; + QuantizeMultiplier(adjusted_input0_scale, + &input0_multiplier, + &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, + &input1_multiplier, + &input1_shift); + QuantizeMultiplier(adjusted_output_scale, + &output_multiplier, + &output_shift); + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + auto input0_ptr = input0->data(); + auto input1_ptr = input1->data(); + auto output_ptr = output->mutable_data(); + + index_t handled_output_size = 0; +#ifdef MACE_ENABLE_NEON + #pragma omp parallel for + for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = 
vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); + auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); + sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); + sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); + sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); + sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); + const auto sum_low_s16 = vmovn_s32(sum_low); + const auto sum_high_s16 = vmovn_s32(sum_high); + const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16, + sum_high_s16), + vdupq_n_s16(output->zero_point())); + vst1_u8(output_ptr + i, vqmovun_s16(output_val)); + } + handled_output_size = output->size() - output->size() % 8; +#endif // NEON +#pragma omp parallel for + for (index_t i = handled_output_size; i < output->size(); ++i) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = 
input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + const int32_t sum = multiplied_input0 + multiplied_input1; + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(sum, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + DataFormat data_format_; + Tensor scalar_tensor_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class EltwiseOp : public Operation { + public: + explicit EltwiseOp(OpConstructContext *context) + : Operation(context) { + EltwiseType type = static_cast( + Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE))); + std::vector coeff = Operation::GetRepeatedArgs("coeff"); + float scalar_input = Operation::GetOptionalArg("scalar_input", 1.0); + int32_t scalar_input_index = Operation::GetOptionalArg( + "scalar_input_index", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::EltwiseKernel( + type, coeff, scalar_input, scalar_input_index)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->InputSize() == 2 ? 
this->Input(1) : nullptr; + Tensor *output = this->Output(0); + return kernel_->Compute(context, input0, input1, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterEltwise(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::CPU, int32_t); + + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::CPU, uint8_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index d507011a2ed74b93ce105100590d6f20a3bbc899..b71f4e42131d9c86fdd68169ad88a39ddf53c369 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -15,18 +15,6 @@ #ifndef MACE_KERNELS_ELTWISE_H_ #define MACE_KERNELS_ELTWISE_H_ -#include -#include -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/quantize.h" - namespace mace { namespace kernels { @@ -45,1071 +33,7 @@ enum EltwiseType { NONE = 11, }; -static bool IsLogicalType(EltwiseType type) { return type == EQUAL; } - -inline index_t GetIndex(const std::vector &shape, - const std::vector &index) { - index_t idx = 0; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] > 1) { - idx = idx * shape[i] + index[i]; - } - } - return idx; -} - -inline void IncreaseIndex(const std::vector &shape, - std::vector *index) { - for (index_t i = static_cast(shape.size()) - 1; i >= 0; --i) { - ++(*index)[i]; - if ((*index)[i] >= shape[i]) { - (*index)[i] -= shape[i]; - } else { - break; - } - } -} - -template -inline void TensorGeneralBroadcastEltwise( - const EltwiseType type, - 
const T *input0, - const T *input1, - const std::vector &coeff, - const bool swapped, - const std::vector &input0_shape, - const std::vector &input1_shape, - const std::vector &output_shape, - DstType *output) { - const index_t output_size = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - std::vector out_index(output_shape.size(), 0); - switch (type) { - case SUM: - if (coeff.empty()) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] + input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = - input0[idx0] * coeff_copy[0] + input1[idx1] * coeff_copy[1]; - IncreaseIndex(output_shape, &out_index); - } - } - break; - case SUB: - if (!swapped) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] - input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - } else { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input1[idx1] - input0[idx0]; - IncreaseIndex(output_shape, &out_index); - } - } - break; - case PROD: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] * input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - break; - case DIV: - if (!swapped) { - for (index_t i = 0; i < output_size; ++i) { - 
const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] / input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - } else { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input1[idx1] / input0[idx0]; - IncreaseIndex(output_shape, &out_index); - } - } - break; - case MIN: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::min(input1[idx1], input0[idx0]); - IncreaseIndex(output_shape, &out_index); - } - break; - case MAX: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::max(input1[idx1], input0[idx0]); - IncreaseIndex(output_shape, &out_index); - } - break; - case SQR_DIFF: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::pow(input1[idx1] - input0[idx0], 2.f); - IncreaseIndex(output_shape, &out_index); - } - break; - case POW: - if (!swapped) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::pow(input0[idx0], input1[idx1]); - IncreaseIndex(output_shape, &out_index); - } - } else { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::pow(input1[idx1], input0[idx0]); - IncreaseIndex(output_shape, &out_index); - } - } - break; - case EQUAL: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = 
GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input1[idx1] == input0[idx0]; - IncreaseIndex(output_shape, &out_index); - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -template -inline void TensorBroadcastEltwise(const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - const index_t diff_size, - const index_t common_size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] + input1[i]; - } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] * coeff_copy[0] + - input1[i] * coeff_copy[1]; - } - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] - input1[i]; - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input1[i] - input0[i + d * common_size]; - } - } - } - break; - case PROD: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = input0[i + d * common_size] * input1[i]; - } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { 
- output[i + d * common_size] = - input0[i + d * common_size] / input1[i]; - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input1[i] / input0[i + d * common_size]; - } - } - } - break; - case MIN: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::min(input0[i + d * common_size], input1[i]); - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::max(input0[i + d * common_size], input1[i]); - } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input0[i + d * common_size] - input1[i], 2.f); - } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input0[i + d * common_size], input1[i]); - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input1[i], input0[i + d * common_size]); - } - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = 
- input0[i + d * common_size] == input1[i]; - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -// Multiplication is costly, so we specialize the following case. -template -inline void TensorEltwise(const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - const index_t size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1[i]; - } - - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1[i]; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1[i] - input0[i]; - } - } - break; - case PROD: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1[i]; - } - - break; - case DIV: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1[i]; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1[i] / input0[i]; - } - } - break; - case MIN: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1[i]); - } - - break; - case MAX: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1[i]); - } - - break; - case SQR_DIFF: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1[i], 2.f); - } - - break; - case POW: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; 
++i) { - output[i] = std::pow(input0[i], input1[i]); - } - } else { - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1[i], input0[i]); - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1[i]; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -// Multiplication is costly, so we specialize the following case. -template -inline void TensorScalarEltwise(const EltwiseType type, - const T *input0, - const T input1, - const std::vector &coeff, - const index_t size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1; - } - - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1 - input0[i]; - } - } - break; - case PROD: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1; - } - - break; - case DIV: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1 / input0[i]; - } - } - break; - case MIN: -#pragma omp parallel for - for 
(index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1); - } - - break; - case MAX: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1); - } - - break; - case SQR_DIFF: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1, 2.f); - } - - break; - case POW: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], input1); - } - } else { - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1, input0[i]); - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1; - } - - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -template -inline void TensorEltwisePerChannel(const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - const index_t batch0, - const index_t batch1, - const index_t channel, - const index_t image_size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] + in1_ptr[c]; - } - } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = - in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; - } - } - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] - in1_ptr[c]; - } - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] - in0_ptr[i]; - } - } - } - } - break; - case PROD: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] * in1_ptr[c]; - } - } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] / in1_ptr[c]; - } - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] / in0_ptr[i]; - } - } - } - } - break; - case MIN: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); - } - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); - } - } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); - } - } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); - } - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); - } - } - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] == in1_ptr[c]; - } - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -template -struct EltwiseFunctor : OpKernel { - EltwiseFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, // float as it comes from arg - const int32_t scalar_input_index, - const DataFormat data_format) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index), - data_format_(data_format) {} - - template - MaceStatus DoEltwise(const Tensor *input0, - const Tensor *input1, - Tensor *output) { - bool swapped = false; - if (input0->size() < input1->size()) { - std::swap(input0, input1); - swapped = true; - } - if (scalar_input_index_ == 0) { - swapped = !swapped; - } - - // check if we can broadcast tensor - uint32_t rank_diff = - static_cast(input0->dim_size() - input1->dim_size()); - if (data_format_ == NCHW) { - MACE_CHECK( - (input0->dim_size() == 4) && - ((input1->dim_size() == 0) || - (input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) && 
- (input1->dim(0) == input0->dim(0) || input1->dim(0) == 1)) || - (input1->dim_size() == 1 && input1->dim(0) == input0->dim(1))), - "only support broadcast channel dimension"); - } else { - for (uint32_t i = 0; i < input1->dim_size(); ++i) { - MACE_CHECK(input0->dim(rank_diff + i) == 1 || input1->dim(i) == 1 || - input0->dim(rank_diff + i) == input1->dim(i), - "Element-Wise op only support tail dimensions broadcast"); - } - } - - Tensor::MappingGuard input0_guard(input0); - Tensor::MappingGuard input1_guard(input1); - - const T *input0_ptr = input0->data(); - const T *input1_ptr = input1->data(); - - if (data_format_ == NCHW && input1->dim_size() > 0 && - input1->size() < input0->size()) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); - Tensor::MappingGuard output_guard(output); - DstType *output_ptr = output->mutable_data(); - TensorEltwisePerChannel( - type_, input0_ptr, input1_ptr, coeff_, input0->dim(0), - input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1), - input0->dim(2) * input0->dim(3), swapped, output_ptr); - - } else { - const std::vector &input0_shape = input0->shape(); - std::vector input1_shape(rank_diff, 1); - input1_shape.insert(input1_shape.end(), input1->shape().begin(), - input1->shape().end()); - - std::vector output_shape(input0->dim_size(), 0); - for (unsigned int i = 0; i < input0_shape.size(); ++i) { - output_shape[i] = std::max(input0_shape[i], input1_shape[i]); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - Tensor::MappingGuard output_guard(output); - DstType *output_ptr = output->mutable_data(); - - bool need_general_broadcast = false; - for (uint32_t i = 0; i < input1->dim_size(); ++i) { - if ((input0->dim(rank_diff + i) == 1 && input1->dim(i) > 1) || - (input0->dim(rank_diff + i) > 1 && input1->dim(i) == 1)) { - need_general_broadcast = true; - break; - } - } - - if (need_general_broadcast) { - TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, - swapped, input0_shape, input1_shape, - 
output_shape, output_ptr); - } else if (input1->size() == input0->size()) { - TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), - swapped, output_ptr); - } else if (input1->size() < input0->size()) { - if (input1->size() > 1) { - index_t common_size = input1->size(); - index_t diff_size = input0->size() / common_size; - TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, - diff_size, common_size, swapped, output_ptr); - } else { - TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_, - input0->size(), swapped, output_ptr); - } - } - } - - return MACE_SUCCESS; - } - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - if (input1 == nullptr) { - scalar_tensor_.Resize({}); - Tensor::MappingGuard guard(&scalar_tensor_); - auto scalar_data = scalar_tensor_.mutable_data(); - scalar_data[0] = static_cast(scalar_input_); - input1 = &scalar_tensor_; - } - - if (IsLogicalType(type_)) { - // as we do not have bool-type tensor, we use int type - return DoEltwise(input0, input1, output); - } else { - return DoEltwise(input0, input1, output); - } - } - - EltwiseType type_; - std::vector coeff_; - float scalar_input_; - int32_t scalar_input_index_; - DataFormat data_format_; - Tensor scalar_tensor_; -}; - -template <> -struct EltwiseFunctor : OpKernel { - EltwiseFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, // float as it comes from arg - const int32_t scalar_input_index, - const DataFormat data_format) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index), - data_format_(data_format) {} - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. 
"); - MACE_CHECK(input0->size() == input1->size(), - "input0 and input1 must have the same shape."); - MACE_CHECK(output->scale() != 0); - MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); - - constexpr int left_shift = 20; - const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); - const double adjusted_input0_scale = input0->scale() / doubled_scale; - const double adjusted_input1_scale = input1->scale() / doubled_scale; - const double adjusted_output_scale = - doubled_scale / ((1 << left_shift) * output->scale()); - - int32_t input0_multiplier; - int32_t input1_multiplier; - int32_t output_multiplier; - int32_t input0_shift; - int32_t input1_shift; - int32_t output_shift; - QuantizeMultiplier(adjusted_input0_scale, - &input0_multiplier, - &input0_shift); - QuantizeMultiplier(adjusted_input1_scale, - &input1_multiplier, - &input1_shift); - QuantizeMultiplier(adjusted_output_scale, - &output_multiplier, - &output_shift); - - Tensor::MappingGuard input0_guard(input0); - Tensor::MappingGuard input1_guard(input1); - Tensor::MappingGuard output_guard(output); - - auto input0_ptr = input0->data(); - auto input1_ptr = input1->data(); - auto output_ptr = output->mutable_data(); - - index_t handled_output_size = 0; -#ifdef MACE_ENABLE_NEON -#pragma omp parallel for - for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { - const auto input0_val = vld1_u8(input0_ptr + i); - const auto input1_val = vld1_u8(input1_ptr + i); - const auto input0_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input0_val)); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val)); - const auto offset_input0 = - vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); - const auto offset_input1 = - vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); - auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); - auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); - auto input1_low_s32 = 
vmovl_s16(vget_low_s16(offset_input1)); - auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); - const auto left_shift_dup = vdupq_n_s32(left_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); - input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); - input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); - input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); - input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); - const auto input0_shift_dup = vdupq_n_s32(input0_shift); - const auto input1_shift_dup = vdupq_n_s32(input1_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); - auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); - auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); - sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); - sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); - sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); - sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); - const auto sum_low_s16 = vmovn_s32(sum_low); - const auto sum_high_s16 = vmovn_s32(sum_high); - const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16, - sum_high_s16), - vdupq_n_s16(output->zero_point())); - vst1_u8(output_ptr + i, vqmovun_s16(output_val)); - } - handled_output_size = output->size() - output->size() % 8; -#endif // NEON -#pragma omp parallel for - for (index_t i = handled_output_size; i < output->size(); ++i) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = 
input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - const int32_t sum = multiplied_input0 + multiplied_input1; - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(sum, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } - - return MACE_SUCCESS; - } - - EltwiseType type_; - std::vector coeff_; - float scalar_input_; - int32_t scalar_input_index_; - DataFormat data_format_; - Tensor scalar_tensor_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLEltwiseKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel); -}; -template -struct EltwiseFunctor : OpKernel { - EltwiseFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index, - const DataFormat data_format); - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL +inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; } } // namespace kernels } // namespace mace diff --git a/mace/kernels/expand_dims.h b/mace/kernels/expand_dims.cc similarity index 62% rename from mace/kernels/expand_dims.h rename to mace/kernels/expand_dims.cc index 
05cac1255f29e430ec3ba20b2beaa33b72a4ff9e..5dc58436ac18b7b37fa27dab72af9d52d64fbd69 100644 --- a/mace/kernels/expand_dims.h +++ b/mace/kernels/expand_dims.cc @@ -12,35 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_EXPAND_DIMS_H_ -#define MACE_KERNELS_EXPAND_DIMS_H_ -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/core/operator.h" namespace mace { namespace kernels { -template -struct ExpandDimsFunctor; +template +class ExpandDimsOp; template -struct ExpandDimsFunctor : OpKernel { - explicit ExpandDimsFunctor(OpKernelContext *context, int axis) - : OpKernel(context), axis_(axis) {} - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +class ExpandDimsOp : public Operation { + public: + explicit ExpandDimsOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); index_t input_dims_size = input->dim_size(); if ( axis_ < 0 ) { axis_ += input_dims_size + 1; @@ -58,13 +49,23 @@ struct ExpandDimsFunctor : OpKernel { output->ReuseTensorBuffer(*input); output->Reshape(output_shape); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int axis_; }; +void RegisterExpandDims(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU, int32_t); + + MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU, uint8_t); +} + } // namespace kernels } // namespace mace - -#endif // 
MACE_KERNELS_EXPAND_DIMS_H_ diff --git a/mace/kernels/fill.h b/mace/kernels/fill.cc similarity index 69% rename from mace/kernels/fill.h rename to mace/kernels/fill.cc index 131dd9d4bffc8f851dd22e1f1a1603defc3d5bb2..0cd209307f1b78c246a3bc34a5e178e34d7955b0 100644 --- a/mace/kernels/fill.h +++ b/mace/kernels/fill.cc @@ -12,34 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_FILL_H_ -#define MACE_KERNELS_FILL_H_ -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct FillFunctor; +class FillOp; template <> -struct FillFunctor : OpKernel { - explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {} - - MaceStatus operator()(const Tensor *shape, - const Tensor *value, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +class FillOp : public Operation { + public: + explicit FillOp(OpConstructContext *context) + : Operation(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *shape = this->Input(SHAPE); + const Tensor *value = this->Input(VALUE); + Tensor *output = this->Output(OUTPUT); MACE_CHECK(shape->dim_size() == 1, "Shape must be 1-D"); const index_t num_dims = shape->dim(0); Tensor::MappingGuard shape_guard(shape); @@ -61,11 +53,18 @@ struct FillFunctor : OpKernel { std::fill(output_data, output_data + output->size(), *value_data); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + MACE_OP_INPUT_TAGS(SHAPE, VALUE); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterFill(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Fill", FillOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_FILL_H_ diff --git a/mace/kernels/fully_connected.cc 
b/mace/kernels/fully_connected.cc new file mode 100644 index 0000000000000000000000000000000000000000..a7b74c69281c3aa93bd79b692fc6ce97b3397fdd --- /dev/null +++ b/mace/kernels/fully_connected.cc @@ -0,0 +1,233 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/gemm.h" +#include "mace/kernels/gemmlowp_util.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/fully_connected.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +class FullyConnectedOpBase : public Operation { + public: + explicit FullyConnectedOpBase(OpConstructContext *context) + : Operation(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + protected: + const ActivationType activation_; + const float relux_max_limit_; + + MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + +template +class FullyConnectedOp; + +template <> +class FullyConnectedOp : public FullyConnectedOpBase { + public: + explicit FullyConnectedOp(OpConstructContext *context) + : FullyConnectedOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const 
Tensor *input = this->Input(INPUT); + const Tensor *weight = this->Input(WEIGHT); // OIHW + const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + MACE_CHECK( + input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) && + input->dim(3) == weight->dim(3), + "The shape of Input: ", MakeString(input->shape()), + "The shape of Weight: ", MakeString(weight->shape()), + " don't match."); + if (bias) { + MACE_CHECK(weight->dim(0) == bias->dim(0), + "The shape of Weight: ", MakeString(weight->shape()), + " and shape of Bias: ", bias->dim(0), + " don't match."); + } + std::vector output_shape = {input->dim(0), weight->dim(0), 1, 1}; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + const index_t N = output->dim(0); + const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); + const index_t output_size = weight->dim(0); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_weight(weight); + Tensor::MappingGuard guard_output(output); + const float *input_ptr = input->data(); + const float *weight_ptr = weight->data(); + float *output_ptr = output->mutable_data(); + + Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr); + + if (bias) { + Tensor::MappingGuard guard_bias(bias); + const float *bias_ptr = bias == nullptr ? 
nullptr : bias->data(); + for (int i = 0; i < N; ++i) { + for (int j = 0; j < output_size; ++j) { + output_ptr[j + i * output_size] += bias_ptr[j]; + } + } + } + + DoActivation(output_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } +}; + +template <> +class FullyConnectedOp + : public FullyConnectedOpBase { + public: + explicit FullyConnectedOp(OpConstructContext *context) + : FullyConnectedOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *weight = this->Input(WEIGHT); // OIHW + const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + MACE_CHECK( + input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) && + input->dim(3) == weight->dim(3), + "The shape of Input: ", MakeString(input->shape()), + "The shape of Weight: ", MakeString(weight->shape()), + " don't match."); + if (bias) { + MACE_CHECK(weight->dim(0) == bias->dim(0), + "The shape of Weight: ", MakeString(weight->shape()), + " and shape of Bias: ", bias->dim(0), + " don't match."); + } + auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext(); + MACE_CHECK_NOTNULL(gemm_context); + + std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + const int N = static_cast(output->dim(0)); + const int input_size = + static_cast(weight->dim(1) * weight->dim(2) * weight->dim(3)); + const int output_size = static_cast(weight->dim(0)); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_weight(weight); + Tensor::MappingGuard guard_output(output); + auto input_ptr = input->data(); + auto weight_ptr = weight->data(); + auto output_ptr = output->mutable_data(); + + std::vector bias_shape{output_size}; + std::unique_ptr zero_bias; + const int32_t *bias_ptr = nullptr; + if (bias == nullptr) { + zero_bias.reset( 
+ new Tensor(GetCPUAllocator(), DT_INT32)); + zero_bias->Resize(bias_shape); + zero_bias->Clear(); + bias_ptr = zero_bias->data(); + } else { + bias_ptr = bias->data(); + } + + gemmlowp::MatrixMap + weight_matrix(weight_ptr, output_size, input_size); + gemmlowp::MatrixMap + input_matrix(input_ptr, input_size, N); + gemmlowp::MatrixMap + output_matrix(output_ptr, output_size, N); + + const auto &output_pipeline = GemmlowpOutputPipeline::Make( + bias_ptr, output_size, weight->scale(), input->scale(), output->scale(), + output->zero_point()); + + using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams; + gemmlowp::GemmWithOutputPipeline( + gemm_context, weight_matrix, input_matrix, &output_matrix, + -weight->zero_point(), -input->zero_point(), output_pipeline); + + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class FullyConnectedOp : public FullyConnectedOpBase { + public: + explicit FullyConnectedOp(OpConstructContext *context) + : FullyConnectedOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::FullyConnectedKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *weight = this->Input(WEIGHT); // OIHW + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + MACE_CHECK( + input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) && + input->dim(3) == weight->dim(1), + "The shape of Input: ", MakeString(input->shape()), + "The shape of Weight: ", MakeString(weight->shape()), + " don't match."); + return kernel_->Compute( + context, input, weight, bias, activation_, relux_max_limit_, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterFullyConnected(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::CPU, uint8_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h deleted file mode 100644 index 20a572cbc52fa7ae241c6d77f9e0b9bf2b42e000..0000000000000000000000000000000000000000 --- a/mace/kernels/fully_connected.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_FULLY_CONNECTED_H_ -#define MACE_KERNELS_FULLY_CONNECTED_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/gemm.h" -#include "mace/kernels/gemmlowp_util.h" - -namespace mace { -namespace kernels { - -struct FullyConnectedBase : OpKernel { - FullyConnectedBase(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const ActivationType activation_; - const float relux_max_limit_; -}; - -template -struct FullyConnectedFunctor; - -template <> -struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - std::vector output_shape = {input->dim(0), weight->dim(0), 1, 1}; - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - const index_t N = output->dim(0); - const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); - const index_t output_size = weight->dim(0); - - Tensor::MappingGuard guard_input(input); - Tensor::MappingGuard guard_weight(weight); - Tensor::MappingGuard guard_output(output); - const float *input_ptr = input->data(); - const float *weight_ptr = weight->data(); - float *output_ptr = output->mutable_data(); - - Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr); - - if (bias) { - Tensor::MappingGuard guard_bias(bias); - const float *bias_ptr = bias == nullptr ? 
nullptr : bias->data(); - for (int i = 0; i < N; ++i) { - for (int j = 0; j < output_size; ++j) { - output_ptr[j + i * output_size] += bias_ptr[j]; - } - } - } - - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_); - - return MACE_SUCCESS; - } -}; - -template <> -struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext(); - MACE_CHECK_NOTNULL(gemm_context); - - std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - const int N = static_cast(output->dim(0)); - const int input_size = - static_cast(weight->dim(1) * weight->dim(2) * weight->dim(3)); - const int output_size = static_cast(weight->dim(0)); - - Tensor::MappingGuard guard_input(input); - Tensor::MappingGuard guard_weight(weight); - Tensor::MappingGuard guard_output(output); - auto input_ptr = input->data(); - auto weight_ptr = weight->data(); - auto output_ptr = output->mutable_data(); - - std::vector bias_shape{output_size}; - std::unique_ptr zero_bias; - const int32_t *bias_ptr = nullptr; - if (bias == nullptr) { - zero_bias.reset( - new Tensor(GetCPUAllocator(), DT_INT32)); - zero_bias->Resize(bias_shape); - zero_bias->Clear(); - bias_ptr = zero_bias->data(); - } else { - bias_ptr = bias->data(); - } - - gemmlowp::MatrixMap - weight_matrix(weight_ptr, output_size, input_size); - gemmlowp::MatrixMap - input_matrix(input_ptr, input_size, N); - gemmlowp::MatrixMap - output_matrix(output_ptr, output_size, N); - - const auto &output_pipeline = GemmlowpOutputPipeline::Make( - bias_ptr, output_size, 
weight->scale(), input->scale(), output->scale(), - output->zero_point()); - - using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams; - gemmlowp::GemmWithOutputPipeline( - gemm_context, weight_matrix, input_matrix, &output_matrix, - -weight->zero_point(), -input->zero_point(), output_pipeline); - - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLFullyConnectedKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - const ActivationType activation, - const float relux_max_limit, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel); -}; -template -struct FullyConnectedFunctor : FullyConnectedBase { - FullyConnectedFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_FULLY_CONNECTED_H_ diff --git a/mace/kernels/gather.h b/mace/kernels/gather.cc similarity index 76% rename from mace/kernels/gather.h rename to mace/kernels/gather.cc index d8978a63fc497cd6ae218377e955c51e3d1c07c7..ff947e820cfad7536c3d73ad1d8fe3b4366b11f9 100644 --- a/mace/kernels/gather.h +++ b/mace/kernels/gather.cc @@ -12,43 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_GATHER_H_ -#define MACE_KERNELS_GATHER_H_ - #include -#include -#include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { -struct GatherBase : OpKernel { - GatherBase(OpKernelContext *context, int axis, float y) - : OpKernel(context), axis_(axis), y_(y) {} - - int axis_; - float y_; -}; - -template -struct GatherFunctor; +template +class GatherOp; template <> -struct GatherFunctor : GatherBase { - GatherFunctor(OpKernelContext *context, int axis, float y) - : GatherBase(context, axis, y) {} - - MaceStatus operator()(const Tensor *params, - const Tensor *indices, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +class GatherOp : public Operation { + public: + explicit GatherOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)), + y_(Operation::GetOptionalArg("y", 1.0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *params = this->Input(PARAMS); + const Tensor *indices = this->Input(INDICES); + Tensor *output = this->Output(OUTPUT); std::vector output_shape; if (axis_ < 0) { axis_ += params->dim_size(); @@ -99,11 +85,20 @@ struct GatherFunctor : GatherBase { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + int axis_; + float y_; + MACE_OP_INPUT_TAGS(PARAMS, INDICES); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterGather(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Gather", GatherOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_GATHER_H_ diff --git a/mace/kernels/gemmlowp_util.h b/mace/kernels/gemmlowp_util.h index 863359553fe0b4a9c65e9f67e8766f0ff6c9679f..8a0148e1e21f7be80c493d1c9ed97885f5dca3e4 100644 --- a/mace/kernels/gemmlowp_util.h +++ b/mace/kernels/gemmlowp_util.h @@ 
-18,7 +18,8 @@ #include #include "public/gemmlowp.h" -#include "mace/kernels/quantize.h" +#include "mace/core/types.h" +#include "mace/utils/quantize.h" namespace mace { diff --git a/mace/kernels/identity.cc b/mace/kernels/identity.cc new file mode 100644 index 0000000000000000000000000000000000000000..1fba94bdd50d9dd6927919e0dc92c35687c5bf2c --- /dev/null +++ b/mace/kernels/identity.cc @@ -0,0 +1,50 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "mace/core/operator.h" + +namespace mace { +namespace kernels { + +template +class IdentityOp : public Operation { + public: + explicit IdentityOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + output->ReuseTensorBuffer(*input); + return MaceStatus::MACE_SUCCESS; + } +}; + +void RegisterIdentity(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/ops/infer_conv2d_shape.h b/mace/kernels/infer_conv2d_shape.cc similarity index 68% rename from mace/ops/infer_conv2d_shape.h rename to mace/kernels/infer_conv2d_shape.cc index 6d1fdf4fc0d24063a54c2d6ff299bf0aa45b9c19..0e80aa61e2c93664c2d1ed9caddfbe5b2b894c64 100644 --- a/mace/ops/infer_conv2d_shape.h +++ b/mace/kernels/infer_conv2d_shape.cc @@ -12,44 +12,41 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_INFER_CONV2D_SHAPE_H_ -#define MACE_OPS_INFER_CONV2D_SHAPE_H_ - -#include #include "mace/core/operator.h" #include "mace/kernels/conv_pool_2d_util.h" namespace mace { -namespace ops { +namespace kernels { -template -class InferConv2dShapeOp : public Operator { +template +class InferConv2dShapeOp : public Operation { public: - InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} + explicit InferConv2dShapeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); MACE_CHECK(input->dim_size() == 4); output->Resize({input->dim_size()}); Tensor::MappingGuard output_guard(output); int32_t *output_data = output->mutable_data(); const int32_t data_format = - OperatorBase::GetOptionalArg("data_format", 0); + Operation::GetOptionalArg("data_format", 0); const bool isNCHW = data_format == 1; Padding padding_type = - static_cast(OperatorBase::GetOptionalArg( + static_cast(Operation::GetOptionalArg( "padding", static_cast(SAME))); const std::vector paddings = - OperatorBase::GetRepeatedArgs("padding_values"); + Operation::GetRepeatedArgs("padding_values"); const std::vector kernels = - OperatorBase::GetRepeatedArgs("kernels"); + Operation::GetRepeatedArgs("kernels"); const std::vector strides = - OperatorBase::GetRepeatedArgs("strides", {1, 1}); + Operation::GetRepeatedArgs("strides", {1, 1}); const int32_t out_batch = static_cast(input->dim(0)); const int32_t out_channel = static_cast(kernels[0]); @@ -97,17 +94,22 @@ class InferConv2dShapeOp : public Operator { output_data[3] = out_channel; } - SetFutureDefaultWaitFn(future); - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - - private: - MACE_OP_INPUT_TAGS(INPUT); - 
MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterInferConv2dShape(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_INFER_CONV2D_SHAPE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.cc similarity index 56% rename from mace/kernels/local_response_norm.h rename to mace/kernels/local_response_norm.cc index d53b8f08c6507cf0f5d8272ddd2e597e73435e12..6a51ccb368284c8ef4cc0b48e0fc76c9c1aca122 100644 --- a/mace/kernels/local_response_norm.h +++ b/mace/kernels/local_response_norm.cc @@ -12,40 +12,36 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_LOCAL_RESPONSE_NORM_H_ -#define MACE_KERNELS_LOCAL_RESPONSE_NORM_H_ - #include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/core/operator.h" namespace mace { namespace kernels { -template -struct LocalResponseNormFunctor; - -template<> -struct LocalResponseNormFunctor : OpKernel { - explicit LocalResponseNormFunctor(OpKernelContext *context) - : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - int depth_radius, - float bias, - float alpha, - float beta, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class LocalResponseNormOp; + +template <> +class LocalResponseNormOp : public Operation { + public: + explicit LocalResponseNormOp(OpConstructContext *context) + : Operation(context), + depth_radius_(Operation::GetOptionalArg("depth_radius", 5)), + bias_(Operation::GetOptionalArg("bias", 1.0f)), + alpha_(Operation::GetOptionalArg("alpha", 1.0f)), + beta_(Operation::GetOptionalArg("beta", 0.5f)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. 
", + input->dim_size()); + + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + const index_t batch = input->dim(0); const index_t channels = input->dim(1); const index_t height = input->dim(2); @@ -61,8 +57,8 @@ struct LocalResponseNormFunctor : OpKernel { for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const int begin_input_c = std::max(static_cast(0), - c - depth_radius); - const int end_input_c = std::min(channels, c + depth_radius + 1); + c - depth_radius_); + const int end_input_c = std::min(channels, c + depth_radius_ + 1); index_t pos = b * batch_size; for (index_t hw = 0; hw < height * width; ++hw, ++pos) { @@ -71,18 +67,27 @@ struct LocalResponseNormFunctor : OpKernel { const float input_val = input_ptr[pos + input_c * image_size]; accum += input_val * input_val; } - const float multiplier = std::pow(bias + alpha * accum, -beta); + const float multiplier = std::pow(bias_ + alpha_ * accum, -beta_); output_ptr[pos + c * image_size] = - input_ptr[pos + c * image_size] * multiplier; + input_ptr[pos + c * image_size] * multiplier; } } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + int depth_radius_; + float bias_; + float alpha_; + float beta_; }; +void RegisterLocalResponseNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "LocalResponseNorm", + LocalResponseNormOp, DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_LOCAL_RESPONSE_NORM_H_ diff --git a/mace/ops/lstmcell.h b/mace/kernels/lstm_cell.cc similarity index 50% rename from mace/ops/lstmcell.h rename to mace/kernels/lstm_cell.cc index 3037c891ff5a9b7d9fb25096632556cce4193296..be7f50d904d86acf0bb7deca2bd12e2ceef8d4df 100644 --- a/mace/ops/lstmcell.h +++ b/mace/kernels/lstm_cell.cc @@ -12,28 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_LSTMCELL_H_ -#define MACE_OPS_LSTMCELL_H_ - -#include +#include +#include #include "mace/core/operator.h" -#include "mace/kernels/lstmcell.h" +#include "mace/kernels/opencl/image/lstm_cell.h" namespace mace { -namespace ops { +namespace kernels { template -class LSTMCellOp : public Operator { +class LSTMCellOp; + +template +class LSTMCellOp : public Operation { public: - LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - static_cast( - OperatorBase::GetOptionalArg("scalar_input", - 0.0))) {} - - MaceStatus Run(StatsFuture *future) override { + explicit LSTMCellOp(OpConstructContext *context) + : Operation(context) { + T forget_bias = static_cast( + Operation::GetOptionalArg("scalar_input", + 0.0)); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); const Tensor *pre_output = this->Input(PRE_OUTPUT); const Tensor *weight = this->Input(WEIGHT); @@ -41,19 +47,24 @@ class LSTMCellOp : public Operator { const Tensor *pre_cell = this->Input(PRE_CELL); Tensor *cell = this->Output(CELL); Tensor *output = this->Output(OUTPUT); + return kernel_->Compute(context, input, pre_output, weight, bias, + pre_cell, cell, output); + } - return functor_( - input, pre_output, weight, bias, pre_cell, cell, output, future); - }; - - protected: - kernels::LSTMCellFunctor functor_; + private: + std::unique_ptr kernel_; MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterLSTMCell(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, + DeviceType::GPU, half); +} -#endif // 
MACE_OPS_LSTMCELL_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/lstmcell.h b/mace/kernels/lstmcell.h deleted file mode 100644 index 81a7f386a215b71511da3566e5442dec6711cb65..0000000000000000000000000000000000000000 --- a/mace/kernels/lstmcell.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_LSTMCELL_H_ -#define MACE_KERNELS_LSTMCELL_H_ - -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -namespace mace { -namespace kernels { - -template -struct LSTMCellFunctor; - -class OpenCLLSTMCellKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel); -}; -template -struct LSTMCellFunctor : OpKernel{ - LSTMCellFunctor(OpKernelContext *context, T forget_bias); - MaceStatus operator()(const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; - 
-} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_LSTMCELL_H_ diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.cc similarity index 57% rename from mace/kernels/matmul.h rename to mace/kernels/matmul.cc index 5dab02c528867295df8cd7b2bf03ed1d2bcede81..4723e6557cd20d41e7d60d61666a26109d515bb2 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.cc @@ -12,13 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_MATMUL_H_ -#define MACE_KERNELS_MATMUL_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif - #include #include #include @@ -26,27 +19,65 @@ #include #include -#include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/kernels/gemm.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/utils.h" #include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/sgemm.h" +#include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/matmul.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct MatMulFunctor : OpKernel { - explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) { - MACE_UNUSED(future); +class MatMulOpBase : public Operation { + public: + explicit MatMulOpBase(OpConstructContext *context) + : Operation(context), + transpose_a_(Operation::GetOptionalArg("transpose_a", false)), + transpose_b_(Operation::GetOptionalArg("transpose_b", false)) {} + + inline void Validate() { + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + MACE_CHECK(A->dim_size() == B->dim_size() && A->dim_size() >= 2, + "rank(A) should be equal to rank(B), rank should be greater " + "than or equal to 2"); + index_t rank = A->dim_size(); + for 
(index_t i = 0; i < rank - 2; ++i) { + MACE_CHECK(A->dim(i) == B->dim(i), + "batch dimensions are not equal: ", + A->dim(i), + " vs. ", + B->dim(i)); + } + index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1); + index_t bk = transpose_b_ ? B->dim(rank - 1) : B->dim(rank - 2); + MACE_CHECK(ak == bk, "the number of A's column ", ak, + " must be equal to B's row ", bk); + } + + protected: + MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B); + MACE_OP_OUTPUT_TAGS(OUTPUT); + + bool transpose_a_; + bool transpose_b_; +}; + +template +class MatMulOp : public MatMulOpBase { + public: + explicit MatMulOp(OpConstructContext *context) + : MatMulOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + Tensor *C = this->Output(OUTPUT); index_t batch; index_t height; @@ -56,10 +87,10 @@ struct MatMulFunctor : OpKernel { index_t rank = A->dim_size(); height = A->dim(rank - 2); K = A->dim(rank - 1); - if (transpose_a) { + if (transpose_a_) { std::swap(height, K); } - if (transpose_b) { + if (transpose_b_) { width = B->dim(rank - 2); } else { width = B->dim(rank - 1); @@ -85,7 +116,7 @@ struct MatMulFunctor : OpKernel { const index_t height_b = B->dim(rank - 2); const index_t width_b = B->dim(rank - 1); - auto scratch_buffer = context_->device()->scratch_buffer(); + auto scratch_buffer = context->device()->scratch_buffer(); scratch_buffer->Rewind(); index_t scratch_size = C->raw_max_size(); if (!A->is_weight()) { @@ -103,30 +134,86 @@ struct MatMulFunctor : OpKernel { width_a, height_b, width_b, - transpose_a, - transpose_b, + transpose_a_, + transpose_b_, A->is_weight(), B->is_weight(), c_ptr_base, - context_->device()->scratch_buffer()); - return MACE_SUCCESS; + context->device()->scratch_buffer()); + return MaceStatus::MACE_SUCCESS; } + private: SGemm sgemm_; }; template <> -struct MatMulFunctor : OpKernel { - explicit MatMulFunctor(OpKernelContext *context) : 
OpKernel(context) {} +class MatMulOp: public MatMulOpBase { + public: + explicit MatMulOp(OpConstructContext *context) + : MatMulOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + Tensor *C = this->Output(OUTPUT); + + index_t rank = A->dim_size(); + index_t height = A->dim(rank - 2); + index_t K = A->dim(rank - 1); + index_t width; + + if (transpose_a_) { + std::swap(height, K); + } + if (transpose_b_) { + width = B->dim(rank - 2); + } else { + width = B->dim(rank - 1); + } + + std::vector c_shape = A->shape(); + c_shape[rank - 2] = height; + c_shape[rank - 1] = width; + + MACE_RETURN_IF_ERROR(C->Resize(c_shape)); + + constexpr gemmlowp::MapOrder kRowMajor = gemmlowp::MapOrder::RowMajor; + constexpr gemmlowp::MapOrder kColMajor = gemmlowp::MapOrder::ColMajor; + +#define MATMUL_IMPL(AOrder, BOrder) \ + MatMulImpl(context, A, B, height, K, width, C); + + if (transpose_a_) { + if (transpose_b_) { + MATMUL_IMPL(kColMajor, kColMajor); + } else { + MATMUL_IMPL(kColMajor, kRowMajor); + } + } else { + if (transpose_b_) { + MATMUL_IMPL(kRowMajor, kColMajor); + } else { + MATMUL_IMPL(kRowMajor, kRowMajor); + } + } + +#undef MATMUL_IMPL + + return MaceStatus::MACE_SUCCESS; + } + private: template - void MatMulImpl(const Tensor *A, + void MatMulImpl(OpContext *context, + const Tensor *A, const Tensor *B, const index_t height, const index_t K, const index_t width, Tensor *C) { - auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext(); + auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext(); MACE_CHECK_NOTNULL(gemm_context); Tensor::MappingGuard guarda(A); @@ -158,90 +245,48 @@ struct MatMulFunctor : OpKernel { -B->zero_point(), output_pipeline); } } +}; - MaceStatus operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) { - MACE_UNUSED(future); - - 
index_t rank = A->dim_size(); - index_t height = A->dim(rank - 2); - index_t K = A->dim(rank - 1); - index_t width; - - if (transpose_a) { - std::swap(height, K); - } - if (transpose_b) { - width = B->dim(rank - 2); +#ifdef MACE_ENABLE_OPENCL +template +class MatMulOp : public MatMulOpBase { + public: + explicit MatMulOp(OpConstructContext *context) + : MatMulOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::MatMulKernel); } else { - width = B->dim(rank - 1); + MACE_NOT_IMPLEMENTED; } + } + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + Tensor *C = this->Output(OUTPUT); + return kernel_->Compute(context, A, B, C, transpose_a_, transpose_b_); + } - std::vector c_shape = A->shape(); - c_shape[rank - 2] = height; - c_shape[rank - 1] = width; - - MACE_RETURN_IF_ERROR(C->Resize(c_shape)); - - constexpr gemmlowp::MapOrder kRowMajor = gemmlowp::MapOrder::RowMajor; - constexpr gemmlowp::MapOrder kColMajor = gemmlowp::MapOrder::ColMajor; - -#define MATMUL_IMPL(AOrder, BOrder) \ - MatMulImpl(A, B, height, K, width, C); - - if (transpose_a) { - if (transpose_b) { - MATMUL_IMPL(kColMajor, kColMajor); - } else { - MATMUL_IMPL(kColMajor, kRowMajor); - } - } else { - if (transpose_b) { - MATMUL_IMPL(kRowMajor, kColMajor); - } else { - MATMUL_IMPL(kRowMajor, kRowMajor); - } - } + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL -#undef MATMUL_IMPL - return MACE_SUCCESS; - } -}; +void RegisterMatMul(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::CPU, uint8_t); #ifdef MACE_ENABLE_OPENCL -class OpenCLMatMulKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - 
StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel); -}; -template -struct MatMulFunctor : OpKernel { - explicit MatMulFunctor(OpKernelContext *context); - - MaceStatus operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future); + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::GPU, float); - std::unique_ptr kernel_; -}; + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_MATMUL_H_ diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc deleted file mode 100644 index 14c014ba6894ac6407c989337308dff1757a6b1c..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/activation.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/activation.h" - -#include "mace/kernels/opencl/image/activation.h" - -namespace mace { -namespace kernels { - -template -ActivationFunctor::ActivationFunctor( - OpKernelContext *context, - ActivationType type, - T relux_max_limit) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset( - new opencl::image::ActivationKernel(type, relux_max_limit)); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus ActivationFunctor::operator()( - const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, alpha, output, future); -} - -template struct ActivationFunctor; -template struct ActivationFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/activation.h b/mace/kernels/opencl/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..35f1785cf9f3c330570fbffb2c52ab71fbb880e2 --- /dev/null +++ b/mace/kernels/opencl/activation.h @@ -0,0 +1,40 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_ACTIVATION_H_ +#define MACE_KERNELS_OPENCL_ACTIVATION_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLActivationKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLActivationKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_ACTIVATION_H_ diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc deleted file mode 100644 index af3d18d5453056c53be268799b73a976aa0c1083..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/addn.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/addn.h" - -#include "mace/kernels/opencl/image/addn.h" - -namespace mace { -namespace kernels { - -template -AddNFunctor::AddNFunctor(OpKernelContext *context) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset( - new opencl::image::AddNKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus AddNFunctor::operator()( - const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { - return kernel_->Compute(context_, input_tensors, output_tensor, future); -} - -template struct AddNFunctor; -template struct AddNFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/addn.h b/mace/kernels/opencl/addn.h new file mode 100644 index 0000000000000000000000000000000000000000..908ff11328f030a051d68024db477bf3d0072ef4 --- /dev/null +++ b/mace/kernels/opencl/addn.h @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_ADDN_H_ +#define MACE_KERNELS_OPENCL_ADDN_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { + +class OpenCLAddNKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &input_tensors, + Tensor *output_tensor) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLAddNKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_ADDN_H_ diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc deleted file mode 100644 index c09f8eb23b53c9b1a474a6b30214d6045dfa08ff..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/batch_norm.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/batch_norm.h" -#include "mace/kernels/opencl/image/batch_norm.h" - -namespace mace { -namespace kernels { - -template -BatchNormFunctor::BatchNormFunctor( - OpKernelContext *context, - const bool folded_constant, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BatchNormKernel( - folded_constant, activation, relux_max_limit)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus BatchNormFunctor::operator()( - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, scale, offset, mean, - var, epsilon, output, future); -} - -template struct BatchNormFunctor; -template struct BatchNormFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/batch_norm.h b/mace/kernels/opencl/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..b97dfe6ecc135fb8336548c0795b5a3ea6ce0dd8 --- /dev/null +++ b/mace/kernels/opencl/batch_norm.h @@ -0,0 +1,43 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BATCH_NORM_H_ +#define MACE_KERNELS_OPENCL_BATCH_NORM_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBatchNormKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBatchNormKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BATCH_NORM_H_ diff --git a/mace/kernels/opencl/batch_to_space.cc b/mace/kernels/opencl/batch_to_space.cc deleted file mode 100644 index 7fe533ebe168a01d6d202410b781efe1de51feac..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/batch_to_space.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ -#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ - -#include "mace/kernels/batch_to_space.h" -#include "mace/kernels/opencl/image/batch_to_space.h" - -namespace mace { -namespace kernels { - -template -BatchToSpaceFunctor::BatchToSpaceFunctor( - OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BatchToSpaceKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus BatchToSpaceFunctor::operator()( - const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) { - std::vector output_shape(4, 0); - CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, - output_shape.data()); - return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_, - output_shape, space_tensor, future); -} - -template struct BatchToSpaceFunctor; -template struct BatchToSpaceFunctor; - -} // namespace kernels -} // namespace mace -#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/reshape.h b/mace/kernels/opencl/batch_to_space.h similarity index 52% rename from mace/kernels/reshape.h rename to mace/kernels/opencl/batch_to_space.h index f0ab1bf583b226950a9382e3d5b7a78dfa388c0b..9f155336e0df92e365086dca3d9646ec96096ac0 100644 --- a/mace/kernels/reshape.h +++ b/mace/kernels/opencl/batch_to_space.h @@ -12,35 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_RESHAPE_H_ -#define MACE_KERNELS_RESHAPE_H_ +#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/types.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" namespace mace { -namespace kernels { - -template -struct ReshapeFunctor : OpKernel { - explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - const std::vector &out_shape, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - output->ReuseTensorBuffer(*input); - output->Reshape(out_shape); +class OpContext; +class Tensor; - return MACE_SUCCESS; - } +namespace kernels { +class OpenCLBatchToSpaceKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBatchToSpaceKernel); }; } // namespace kernels } // namespace mace -#endif // MACE_KERNELS_RESHAPE_H_ +#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc deleted file mode 100644 index 6904eed9d113693b55578485d8d9e7d80196b5a4..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/bias_add.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/bias_add.h" -#include "mace/kernels/opencl/image/bias_add.h" - -namespace mace { -namespace kernels { - -template -BiasAddFunctor::BiasAddFunctor( - OpKernelContext *context, - const DataFormat data_format) - : BiasAddFunctorBase(context, data_format) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BiasAddKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus BiasAddFunctor::operator()(const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, - "gpu only support biasadd for 4-dimensional NHWC format tensor"); - return kernel_->Compute(context_, input, bias, output, future); -} - -template struct BiasAddFunctor; -template struct BiasAddFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/bias_add.h b/mace/kernels/opencl/bias_add.h new file mode 100644 index 0000000000000000000000000000000000000000..1a0a105056f119e011b6c3b49e687bacca1c255a --- /dev/null +++ b/mace/kernels/opencl/bias_add.h @@ -0,0 +1,40 @@ +// Copyright 2017 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_BIAS_ADD_H_ +#define MACE_KERNELS_OPENCL_BIAS_ADD_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBiasAddKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBiasAddKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BIAS_ADD_H_ diff --git a/mace/kernels/opencl/buffer/buffer_inverse_transform.h b/mace/kernels/opencl/buffer/buffer_inverse_transform.h index 93bd22a9412b7d2fd48cd1297da32cc5c7a3e371..29e6314364e97ac25c576bef77d3561793e519ab 100644 --- a/mace/kernels/opencl/buffer/buffer_inverse_transform.h +++ b/mace/kernels/opencl/buffer/buffer_inverse_transform.h @@ -15,7 +15,10 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ #define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ -#include "mace/kernels/buffer_inverse_transform.h" +#include "mace/kernels/opencl/buffer_inverse_transform.h" + +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -24,40 +27,37 @@ namespace opencl { namespace buffer { MaceStatus BufferTypeTransform( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); template class BufferInverseTransform: public 
OpenCLBufferInverseTransformKernel { public: - MaceStatus Compute(OpKernelContext *context, + MaceStatus Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; }; template -MaceStatus BufferInverseTransform::Compute(OpKernelContext *context, +MaceStatus BufferInverseTransform::Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_UNUSED(type); MACE_UNUSED(wino_blk_size); const DataType dt = DataTypeToEnum::value; if (input->dtype() != output->dtype()) { - return BufferTypeTransform(context, &kernel_, input, dt, output, future); + return BufferTypeTransform(context, &kernel_, input, dt, output); } else { - SetFutureDefaultWaitFn(future); + SetFutureDefaultWaitFn(context->future()); output->ReuseTensorBuffer(*input); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/buffer_transform.cc b/mace/kernels/opencl/buffer/buffer_transform.cc index 73ee521cef20f273a55e71bfcb529e04eb122f5d..7e5897a5b640ff7afcece927f4a40a19016d4ff8 100644 --- a/mace/kernels/opencl/buffer/buffer_transform.cc +++ b/mace/kernels/opencl/buffer/buffer_transform.cc @@ -24,12 +24,11 @@ namespace opencl { namespace buffer { MaceStatus TransformConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t out_chan = input->dim(0); const index_t in_chan = input->dim(1); const index_t filter_height = input->dim(2); @@ -90,20 +89,19 @@ MaceStatus TransformConv2DFilter( transformed_shape[3]); std::vector lws = {4, 4, 4, 0}; MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION // Mark the 
buffer unused. const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus TransformDWConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t multiplier = input->dim(0); const index_t in_chan = input->dim(1); const index_t filter_height = input->dim(2); @@ -159,20 +157,19 @@ MaceStatus TransformDWConv2DFilter( transformed_shape[3]); std::vector lws = {4, 4, 4, 0}; MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION // Mark the buffer unused. const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus TransformArgument( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t size = input->dim(0); std::vector transformed_shape = {RoundUp(size, 4)}; @@ -225,8 +222,8 @@ MaceStatus TransformArgument( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -235,7 +232,7 @@ MaceStatus TransformArgument( } // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/buffer_transform.h b/mace/kernels/opencl/buffer/buffer_transform.h index 4c56f316e70b10dfb4aaa2c62a76a778600c5ed1..4a2213e413b7b2b405e3d7611fad5fb6418e7bc2 100644 --- a/mace/kernels/opencl/buffer/buffer_transform.h +++ b/mace/kernels/opencl/buffer/buffer_transform.h @@ -15,9 +15,12 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ #define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ +#include "mace/kernels/opencl/buffer_transform.h" + #include -#include "mace/kernels/buffer_transform.h" +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -26,48 +29,43 @@ namespace opencl { namespace buffer { MaceStatus BufferTypeTransform( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); MaceStatus TransformConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); MaceStatus TransformDWConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); MaceStatus TransformArgument( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); template class BufferTransform: public OpenCLBufferTransformKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -75,30 +73,26 @@ 
class BufferTransform: public OpenCLBufferTransformKernel { }; template -MaceStatus BufferTransform::Compute(OpKernelContext *context, +MaceStatus BufferTransform::Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_UNUSED(type); MACE_UNUSED(wino_blk_size); const DataType dt = DataTypeToEnum::value; switch (type) { case CONV2D_FILTER: - return TransformConv2DFilter(context, &kernel_, input, - dt, output, future); + return TransformConv2DFilter(context, &kernel_, input, dt, output); case DW_CONV2D_FILTER: - return TransformDWConv2DFilter(context, &kernel_, input, - dt, output, future); + return TransformDWConv2DFilter(context, &kernel_, input, dt, output); case ARGUMENT: - return TransformArgument(context, &kernel_, input, dt, output, future); + return TransformArgument(context, &kernel_, input, dt, output); default: if (input->dtype() != dt) { - return BufferTypeTransform(context, &kernel_, input, - dt, output, future); + return BufferTypeTransform(context, &kernel_, input, dt, output); } else { - SetFutureDefaultWaitFn(future); + SetFutureDefaultWaitFn(context->future()); output->ReuseTensorBuffer(*input); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/buffer_type_transform.cc b/mace/kernels/opencl/buffer/buffer_type_transform.cc index 8de6d6df02b6c3111b64d6b65b817e1eae198010..4f78f83a1f9d119dd24069917c0a911fa3e87702 100644 --- a/mace/kernels/opencl/buffer/buffer_type_transform.cc +++ b/mace/kernels/opencl/buffer/buffer_type_transform.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -25,12 +24,11 @@ namespace buffer { MaceStatus BufferTypeTransform( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); auto runtime = context->device()->opencl_runtime(); @@ -80,8 +78,8 @@ MaceStatus BufferTypeTransform( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -90,7 +88,7 @@ MaceStatus BufferTypeTransform( } // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/conv_2d.h b/mace/kernels/opencl/buffer/conv_2d.h index ba1983208547976dad0e472b2a250e35ee806745..8e7ee8b53b4381455ec60710479856bf7703da61 100644 --- a/mace/kernels/opencl/buffer/conv_2d.h +++ b/mace/kernels/opencl/buffer/conv_2d.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ #define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/conv_2d.h" #include #include @@ -29,7 +29,7 @@ namespace opencl { namespace buffer { namespace conv2d { -extern MaceStatus Conv2d1x1(OpKernelContext *context, +extern MaceStatus Conv2d1x1(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, const Tensor *filter, @@ -42,7 +42,7 @@ extern MaceStatus Conv2d1x1(OpKernelContext *context, Tensor *output, StatsFuture *future); -extern MaceStatus Conv2dGeneral(OpKernelContext *context, +extern MaceStatus Conv2dGeneral(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -63,7 +63,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { Conv2dKernel() : old_scratch_size_(0) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -73,8 +73,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: index_t old_scratch_size_; @@ -85,7 +84,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { template MaceStatus Conv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -95,8 +94,7 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor 
*output, - StatsFuture *future) { + Tensor *output) { StatsFuture pad_future, conv_future; index_t filter_h = filter->dim(2); index_t filter_w = filter->dim(3); @@ -206,7 +204,7 @@ MaceStatus Conv2dKernel::Compute( }; } MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); - MergeMultipleFutureWaitFn({pad_future, conv_future}, future); + MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future()); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/conv_2d_1x1.cc b/mace/kernels/opencl/buffer/conv_2d_1x1.cc index 97854cf47cadcc93069aea6bb72b2b9e747dd342..cbe12466a110612050365614fe09cb8a03a5dfe3 100644 --- a/mace/kernels/opencl/buffer/conv_2d_1x1.cc +++ b/mace/kernels/opencl/buffer/conv_2d_1x1.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -24,7 +23,7 @@ namespace opencl { namespace buffer { namespace conv2d { -MaceStatus Conv2d1x1(OpKernelContext *context, +MaceStatus Conv2d1x1(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, const Tensor *filter, @@ -117,7 +116,7 @@ MaceStatus Conv2d1x1(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace conv2d diff --git a/mace/kernels/opencl/buffer/conv_2d_general.cc b/mace/kernels/opencl/buffer/conv_2d_general.cc index 67feecdf79a383f4971f0448d7a71ac85c430650..17506a8b96288a1a92909a9239a977fdde7e1b72 100644 --- a/mace/kernels/opencl/buffer/conv_2d_general.cc +++ b/mace/kernels/opencl/buffer/conv_2d_general.cc @@ -12,11 +12,10 @@ // See the License for the 
specific language governing permissions and // limitations under the License. +#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -24,7 +23,7 @@ namespace opencl { namespace buffer { namespace conv2d { -MaceStatus Conv2dGeneral(OpKernelContext *context, +MaceStatus Conv2dGeneral(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, const Tensor *filter, @@ -131,7 +130,7 @@ MaceStatus Conv2dGeneral(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace conv2d diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.cc b/mace/kernels/opencl/buffer/depthwise_conv2d.cc index bcd36bba706cd5affb40caf8c587b3577aac298e..9ff27690166630f1fca9de29f57dbda22108f4a6 100644 --- a/mace/kernels/opencl/buffer/depthwise_conv2d.cc +++ b/mace/kernels/opencl/buffer/depthwise_conv2d.cc @@ -23,7 +23,7 @@ namespace opencl { namespace buffer { namespace depthwise { -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, // NHWC const Tensor *filter, // HWIM @@ -127,7 +127,7 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace depthwise diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.h b/mace/kernels/opencl/buffer/depthwise_conv2d.h index 23fddf0e3e7ac5eb4fa88ef22c548c677d1bfeba..b5e26c40de01175a807dacea50a7fe6f5a03377d 100644 --- a/mace/kernels/opencl/buffer/depthwise_conv2d.h +++ b/mace/kernels/opencl/buffer/depthwise_conv2d.h @@ -14,7 +14,7 @@ #ifndef 
MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ #define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ -#include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/opencl/depthwise_conv2d.h" #include #include @@ -29,7 +29,7 @@ namespace opencl { namespace buffer { namespace depthwise { -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, // NHWC const Tensor *filter, // HWIM @@ -50,7 +50,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: DepthwiseConv2dKernel() : old_scratch_size_(0) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -60,8 +60,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: index_t old_scratch_size_; @@ -72,7 +71,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { template MaceStatus DepthwiseConv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -82,8 +81,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { StatsFuture pad_future, dw_conv_future; index_t filter_w = filter->dim(3); @@ -178,7 +176,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( context, &kernels_[1], padded_input_ptr, filter, bias, strides, dilations, DataTypeToEnum::v(), activation, relux_max_limit, input_changed, output, &dw_conv_future)); - MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future); + MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future()); return MaceStatus::MACE_SUCCESS; } diff --git 
a/mace/kernels/opencl/buffer/pooling.h b/mace/kernels/opencl/buffer/pooling.h index ef4ee4472f4a254768d49abd7839a453365587ee..a4433d13e0a1377de4b11c3f8254b643af39f4c7 100644 --- a/mace/kernels/opencl/buffer/pooling.h +++ b/mace/kernels/opencl/buffer/pooling.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ #define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ -#include "mace/kernels/pooling.h" +#include "mace/kernels/opencl/pooling.h" #include #include @@ -35,7 +35,7 @@ class PoolingKernel : public OpenCLPoolingKernel { public: PoolingKernel() : old_scratch_size_(0) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -43,8 +43,7 @@ class PoolingKernel : public OpenCLPoolingKernel { const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: index_t old_scratch_size_; @@ -55,7 +54,7 @@ class PoolingKernel : public OpenCLPoolingKernel { template MaceStatus PoolingKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -63,8 +62,7 @@ MaceStatus PoolingKernel::Compute( const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) << "Pooling opencl kernel not support dilation yet"; @@ -200,9 +198,9 @@ MaceStatus PoolingKernel::Compute( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, &pooling_future)); MACE_OUT_OF_RANGE_VALIDATION - MergeMultipleFutureWaitFn({pad_future, pooling_future}, future); + MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future()); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/softmax.h 
b/mace/kernels/opencl/buffer/softmax.h index 59bb8d26dd25bf7313a0882c348e62cdc68c2a57..502899d890ab2e28f64a36b4c98bdc94be722e04 100644 --- a/mace/kernels/opencl/buffer/softmax.h +++ b/mace/kernels/opencl/buffer/softmax.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ #define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ -#include "mace/kernels/softmax.h" +#include "mace/kernels/opencl/softmax.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -31,10 +33,9 @@ template class SoftmaxKernel : public OpenCLSoftmaxKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -44,10 +45,9 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { template MaceStatus SoftmaxKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) { + Tensor *output) { index_t batch = 0; index_t height = 0; index_t width = 0; @@ -112,9 +112,9 @@ MaceStatus SoftmaxKernel::Compute( std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/utils.cc b/mace/kernels/opencl/buffer/utils.cc index abc06ca8538fb66151b636bf913f036c22302905..a6d5502a4a79424d1228b0ceba73ef314656e1b0 100644 --- a/mace/kernels/opencl/buffer/utils.cc +++ b/mace/kernels/opencl/buffer/utils.cc @@ -26,7 +26,7 @@ namespace kernels { namespace opencl { namespace buffer { -MaceStatus PadInput(OpKernelContext *context, +MaceStatus PadInput(OpContext *context, cl::Kernel *kernel, 
const Tensor *input, const int pad_top, @@ -88,7 +88,7 @@ MaceStatus PadInput(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/utils.h b/mace/kernels/opencl/buffer/utils.h index f19a8210da90a7e0303bdf53d7e0d1687f920210..f783691fed108c08cad167689f4ed15e63abd420 100644 --- a/mace/kernels/opencl/buffer/utils.h +++ b/mace/kernels/opencl/buffer/utils.h @@ -16,7 +16,7 @@ #define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_ #include "mace/core/future.h" -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" @@ -25,7 +25,7 @@ namespace kernels { namespace opencl { namespace buffer { -MaceStatus PadInput(OpKernelContext *context, +MaceStatus PadInput(OpContext *context, cl::Kernel *kernel, const Tensor *input, const int pad_top, diff --git a/mace/kernels/opencl/buffer_inverse_transform.cc b/mace/kernels/opencl/buffer_inverse_transform.cc deleted file mode 100644 index 352fbed7cdf1193426ce2ad6c8a414fd126456b1..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/buffer_inverse_transform.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/buffer_inverse_transform.h" -#include "mace/kernels/opencl/image/image_to_buffer.h" -#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h" - -namespace mace { -namespace kernels { - -template -BufferInverseTransformFunctor< - DeviceType::GPU, T>::BufferInverseTransformFunctor( - OpKernelContext *context, - const int wino_blk_size) - : BufferInverseTransformFunctorBase(context, wino_blk_size) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ImageToBuffer); - } else { - kernel_.reset(new opencl::buffer::BufferInverseTransform); - } -} - -template -MaceStatus BufferInverseTransformFunctor::operator()( - const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, type, - wino_blk_size_, output, future); -} - -template struct BufferInverseTransformFunctor; -template struct BufferInverseTransformFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/buffer_inverse_transform.h b/mace/kernels/opencl/buffer_inverse_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..0c7859108e89cb5324d63dc630ea22fd5149c220 --- /dev/null +++ b/mace/kernels/opencl/buffer_inverse_transform.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ + +#include "mace/kernels/opencl/common.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBufferInverseTransformKernel { + public: + virtual MaceStatus Compute(OpContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel) +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/kernels/opencl/buffer_transform.cc b/mace/kernels/opencl/buffer_transform.cc deleted file mode 100644 index 55854753af8bbb96aae9fb9e9582a2ea57afa56d..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/buffer_transform.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/buffer_transform.h" -#include "mace/kernels/opencl/image/buffer_to_image.h" -#include "mace/kernels/opencl/buffer/buffer_transform.h" - -namespace mace { -namespace kernels { - -template -BufferTransformFunctor::BufferTransformFunctor( - OpKernelContext *context, - const int wino_blk_size) - : BufferTransformFunctorBase(context, wino_blk_size) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BufferToImage); - } else { - kernel_.reset(new opencl::buffer::BufferTransform); - } -} - -template -MaceStatus BufferTransformFunctor::operator()( - const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, type, - wino_blk_size_, output, future); -} - -template struct BufferTransformFunctor; -template struct BufferTransformFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/buffer_transform.h b/mace/kernels/opencl/buffer_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..cc53ef77b96aa57b16779c439a51df35ea0877d6 --- /dev/null +++ b/mace/kernels/opencl/buffer_transform.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_ + +#include "mace/kernels/opencl/common.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBufferTransformKernel { + public: + virtual MaceStatus Compute(OpContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_ diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc deleted file mode 100644 index 7d8365038250265a43bc09569d8710cc0cabf75a..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/channel_shuffle.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/channel_shuffle.h" -#include "mace/kernels/opencl/image/channel_shuffle.h" - -namespace mace { -namespace kernels { - -template -ChannelShuffleFunctor::ChannelShuffleFunctor( - OpKernelContext *context, - const int groups) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ChannelShuffleFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ChannelShuffleFunctor; -template struct ChannelShuffleFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/channel_shuffle.h b/mace/kernels/opencl/channel_shuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..5a5da02758498bf24c8800a1bc91f62e37ec71d1 --- /dev/null +++ b/mace/kernels/opencl/channel_shuffle.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_ +#define MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLChannelShuffleKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLChannelShuffleKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_ diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc deleted file mode 100644 index 12ba334f241965371e69e23102ab23f91c9de20d..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/concat.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/concat.h" -#include "mace/kernels/opencl/image/concat.h" - -namespace mace { -namespace kernels { - -template -ConcatFunctor::ConcatFunctor( - OpKernelContext *context, - const int32_t axis) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ConcatKernel(axis)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ConcatFunctor::operator()( - const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input_list, output, future); -} - -template struct ConcatFunctor; -template struct ConcatFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/concat.h b/mace/kernels/opencl/concat.h new file mode 100644 index 0000000000000000000000000000000000000000..78ef14d9c453642c3228e59020b35e91daa175c3 --- /dev/null +++ b/mace/kernels/opencl/concat.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CONCAT_H_ +#define MACE_KERNELS_OPENCL_CONCAT_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLConcatKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &input_list, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConcatKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CONCAT_H_ diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc deleted file mode 100644 index 38bb2e8f5733949cd163103b489f56478d437619..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/conv_2d.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/opencl/image/conv_2d.h" -#include "mace/kernels/opencl/buffer/conv_2d.h" - -namespace mace { -namespace kernels { - -template -Conv2dFunctor::Conv2dFunctor( - OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::Conv2dKernel); - } else { - kernel_.reset(new opencl::buffer::Conv2dKernel); - } -} - -template -MaceStatus Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - // Compute - return kernel_->Compute(context_, input, filter, bias, - strides_, padding_type_, paddings_, - dilations_, activation_, relux_max_limit_, - output, future); -} - -template struct Conv2dFunctor; -template struct Conv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/conv_2d.h b/mace/kernels/opencl/conv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..d5ff40edd7d29a54e43de892693e6d01cf4e6989 --- /dev/null +++ b/mace/kernels/opencl/conv_2d.h @@ -0,0 +1,47 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_CONV_2D_H_ +#define MACE_KERNELS_OPENCL_CONV_2D_H_ + +#include + +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { +class OpContext; + +namespace kernels { +class OpenCLConv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConv2dKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CONV_2D_H_ diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc deleted file mode 100644 index 720b2c8cdb49bd2ea664b7e4fe1cae516527189e..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/crop.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/crop.h" -#include "mace/kernels/opencl/image/crop.h" - -namespace mace { -namespace kernels { - -template -CropFunctor::CropFunctor(OpKernelContext *context, - const int axis, - const std::vector &offset) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::CropKernel(axis, offset)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus CropFunctor::operator()( - const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input_list, output, future); -} - -template struct CropFunctor; -template struct CropFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/crop.h b/mace/kernels/opencl/crop.h new file mode 100644 index 0000000000000000000000000000000000000000..d59f67f59648dccc05066908c68f15147dc6ebd9 --- /dev/null +++ b/mace/kernels/opencl/crop.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CROP_H_ +#define MACE_KERNELS_OPENCL_CROP_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLCropKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &input_list, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLCropKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CROP_H_ diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc deleted file mode 100644 index e449a2ef29f70a053e479dc7f844890f2a63ca9c..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/deconv_2d.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/deconv_2d.h" -#include "mace/kernels/opencl/image/deconv_2d.h" - -namespace mace { -namespace kernels { - -template -Deconv2dFunctor::Deconv2dFunctor( - OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit) - : Deconv2dFunctorBase(context, - strides, - padding_type, - paddings, - model_type, - activation, - relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::Deconv2dKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus Deconv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const Tensor *output_shape_tensor, - Tensor *output, - StatsFuture *future) { - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); - std::vector paddings(2); - std::vector out_paddings(2); - std::vector output_shape(4); - if (model_type_ == FrameworkType::TENSORFLOW) { - paddings = std::vector(2, 0); - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); - output_shape = - std::vector(output_shape_data, output_shape_data + 4); - CalcDeconvPaddingAndInputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); - } else { - out_paddings = paddings_; - paddings = std::vector(2, 0); - output_shape = std::vector(4, 0); - CalcDeconvOutputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - output_shape.data(), - out_paddings.data(), - paddings.data()); - } - - return kernel_->Compute(context_, input, filter, bias, - strides_.data(), paddings.data(), activation_, - relux_max_limit_, output_shape, 
output, future); -} - -template struct Deconv2dFunctor; -template struct Deconv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/deconv_2d.h b/mace/kernels/opencl/deconv_2d.h new file mode 100644 index 0000000000000000000000000000000000000000..c601acfe3b5cd12cc6a61e77c939aaac0ae81767 --- /dev/null +++ b/mace/kernels/opencl/deconv_2d.h @@ -0,0 +1,46 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_DECONV_2D_H_ +#define MACE_KERNELS_OPENCL_DECONV_2D_H_ + +#include + +#include "mace/kernels/activation.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLDeconv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDeconv2dKernel); +}; +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_DECONV_2D_H_ diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc deleted file mode 100644 index 2ab670d771266d2f7dc7d6a6e64587c074bce6ae..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/depth_to_space.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/depth_to_space.h" -#include "mace/kernels/opencl/image/depth_to_space.h" - -namespace mace { -namespace kernels { -template -DepthToSpaceOpFunctor::DepthToSpaceOpFunctor( - OpKernelContext *context, - const int block_size) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus DepthToSpaceOpFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct DepthToSpaceOpFunctor; -template struct DepthToSpaceOpFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/depth_to_space.h b/mace/kernels/opencl/depth_to_space.h new file mode 100644 index 0000000000000000000000000000000000000000..02585911a501dd522bf129268974132e15f77a67 --- /dev/null +++ b/mace/kernels/opencl/depth_to_space.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { + +class OpenCLDepthToSpaceKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthToSpaceKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/depthwise_conv2d.cc b/mace/kernels/opencl/depthwise_conv2d.cc deleted file mode 100644 index 29f028764a64404de7ab6691d426aa54c7b1672f..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/depthwise_conv2d.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/depthwise_conv2d.h" -#include "mace/kernels/opencl/buffer/depthwise_conv2d.h" -#include "mace/kernels/opencl/image/depthwise_conv2d.h" - -namespace mace { -namespace kernels { -template -DepthwiseConv2dFunctor::DepthwiseConv2dFunctor( - OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthwiseConv2dKernel); - } else { - kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); - } -} - -template -MaceStatus DepthwiseConv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, /* MIHW */ - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, filter, bias, - strides_, padding_type_, paddings_, - dilations_, activation_, relux_max_limit_, - output, future); -} - -template struct DepthwiseConv2dFunctor; -template struct DepthwiseConv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/depthwise_conv2d.h b/mace/kernels/opencl/depthwise_conv2d.h new file mode 100644 index 0000000000000000000000000000000000000000..24d08a208bc11fd26e522a6afe767b411fa76945 --- /dev/null +++ b/mace/kernels/opencl/depthwise_conv2d.h @@ -0,0 +1,48 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_ +#define MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_ + +#include + +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { + +class OpContext; + +namespace kernels { +class OpenCLDepthwiseConv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseConv2dKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc deleted file mode 100644 index e43e21987dbf9d8dfd79f836b31e8692fea4fe3b..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/eltwise.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/eltwise.h" -#include "mace/kernels/opencl/image/eltwise.h" - -namespace mace { -namespace kernels { -template -EltwiseFunctor::EltwiseFunctor( - OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index, - const DataFormat data_format) : OpKernel(context) { - MACE_UNUSED(data_format); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::EltwiseKernel( - type, coeff, scalar_input, scalar_input_index)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus EltwiseFunctor::operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input0, input1, output, future); -} - -template struct EltwiseFunctor; -template struct EltwiseFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/eltwise.h b/mace/kernels/opencl/eltwise.h new file mode 100644 index 0000000000000000000000000000000000000000..83a94feb014d11033f625e80dc4db5367f0d4592 --- /dev/null +++ b/mace/kernels/opencl/eltwise.h @@ -0,0 +1,40 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_ELTWISE_H_ +#define MACE_KERNELS_OPENCL_ELTWISE_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLEltwiseKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLEltwiseKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_ELTWISE_H_ diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc deleted file mode 100644 index 3dd0db4c622f05181c8bd538a54a566df7072189..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/fully_connected.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/fully_connected.h" -#include "mace/kernels/opencl/image/fully_connected.h" - -namespace mace { -namespace kernels { - -template -FullyConnectedFunctor::FullyConnectedFunctor( - OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::FullyConnectedKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus FullyConnectedFunctor::operator()( - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute( - context_, input, weight, bias, activation_, relux_max_limit_, - output, future); -} - -template struct FullyConnectedFunctor; - -template struct FullyConnectedFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/fully_connected.h b/mace/kernels/opencl/fully_connected.h new file mode 100644 index 0000000000000000000000000000000000000000..7982d4685d6ff7d8b1fe27a6c896117b651cb4aa --- /dev/null +++ b/mace/kernels/opencl/fully_connected.h @@ -0,0 +1,45 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_ +#define MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_ + +#include "mace/kernels/activation.h" + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLFullyConnectedKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLFullyConnectedKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_ diff --git a/mace/kernels/opencl/image/activation.h b/mace/kernels/opencl/image/activation.h index 5ddf00ac24d14953cd56a75a3e097c95adee0869..b16330763c255374e9c4e307cb3a838885d746e9 100644 --- a/mace/kernels/opencl/image/activation.h +++ b/mace/kernels/opencl/image/activation.h @@ -14,13 +14,16 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ #define MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ -#include "mace/kernels/activation.h" +#include "mace/kernels/opencl/activation.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,11 +39,10 @@ class ActivationKernel : public OpenCLActivationKernel { : activation_(type), relux_max_limit_(relux_max_limit) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *alpha, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: ActivationType activation_; @@ -53,11 +55,10 @@ class ActivationKernel : public OpenCLActivationKernel { template MaceStatus ActivationKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *alpha, - Tensor *output, - StatsFuture *future) { + 
Tensor *output) { const index_t batch = input->dim(0); const index_t height = input->dim(1); const index_t width = input->dim(2); @@ -133,10 +134,10 @@ MaceStatus ActivationKernel::Compute( Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/addn.h b/mace/kernels/opencl/image/addn.h index 49721c0938d3427e44db708ad4f6656645ea13ba..8f50d140602ecceb8b117378d6a2fdd72a62e7b1 100644 --- a/mace/kernels/opencl/image/addn.h +++ b/mace/kernels/opencl/image/addn.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ #define MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ -#include "mace/kernels/addn.h" +#include "mace/kernels/opencl/addn.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,10 +34,9 @@ template class AddNKernel : public OpenCLAddNKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) override; + Tensor *output_tensor) override; private: cl::Kernel kernel_; @@ -45,10 +46,9 @@ class AddNKernel : public OpenCLAddNKernel { template MaceStatus AddNKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor) { size_t size = input_tensors.size(); MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); @@ -122,9 +122,9 @@ MaceStatus AddNKernel::Compute( Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, 
kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/batch_norm.h b/mace/kernels/opencl/image/batch_norm.h index 7b16015473ae4e374fb57362da729b7033549a08..9414f28b100bf933716f0272357360ad57d6d36c 100644 --- a/mace/kernels/opencl/image/batch_norm.h +++ b/mace/kernels/opencl/image/batch_norm.h @@ -14,13 +14,16 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ #define MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ -#include "mace/kernels/batch_norm.h" +#include "mace/kernels/opencl/batch_norm.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,21 +35,19 @@ template class BatchNormKernel : public OpenCLBatchNormKernel { public: BatchNormKernel( - const bool folded_constant, + const float epsilon, const ActivationType activation, const float relux_max_limit); - MaceStatus Compute(OpKernelContext *context, + MaceStatus Compute(OpContext *context, const Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: - const bool folded_constant_; + const float epsilon_; const ActivationType activation_; const float relux_max_limit_; cl::Kernel kernel_; @@ -55,25 +56,23 @@ class BatchNormKernel : public OpenCLBatchNormKernel { }; template -BatchNormKernel::BatchNormKernel(const bool folded_constant, +BatchNormKernel::BatchNormKernel(const float epsilon, const ActivationType activation, const float relux_max_limit) - : folded_constant_(folded_constant), + : epsilon_(epsilon), activation_(activation), relux_max_limit_(relux_max_limit) {} template MaceStatus BatchNormKernel::Compute( - OpKernelContext *context, + 
OpContext *context, const Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); + Tensor *output) { + bool not_folded = (mean != nullptr && var != nullptr); const index_t batch = input->dim(0); const index_t height = input->dim(1); @@ -98,7 +97,7 @@ MaceStatus BatchNormKernel::Compute( built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (folded_constant_) { + if (!not_folded) { built_options.emplace("-DFOLDED_CONSTANT"); } switch (activation_) { @@ -134,10 +133,10 @@ MaceStatus BatchNormKernel::Compute( kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(offset->opencl_image())); - if (!folded_constant_) { + if (not_folded) { kernel_.setArg(idx++, *(mean->opencl_image())); kernel_.setArg(idx++, *(var->opencl_image())); - kernel_.setArg(idx++, epsilon); + kernel_.setArg(idx++, epsilon_); } kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, relux_max_limit_); @@ -148,11 +147,11 @@ MaceStatus BatchNormKernel::Compute( const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), - output->dim(1), output->dim(2), output->dim(3), folded_constant_); + output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/batch_to_space.h b/mace/kernels/opencl/image/batch_to_space.h index 
f3c4bf8cf7beabf9141f8b4b9d1cea4ff700297d..8d9842707a8f425e547945815d0f845adb27732b 100644 --- a/mace/kernels/opencl/image/batch_to_space.h +++ b/mace/kernels/opencl/image/batch_to_space.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ #define MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ -#include "mace/kernels/batch_to_space.h" +#include "mace/kernels/opencl/batch_to_space.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,13 +34,12 @@ template class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *batch_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *space_tensor, - StatsFuture *future) override; + Tensor *space_tensor) override; private: cl::Kernel kernel_; @@ -48,13 +49,12 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { template MaceStatus BatchToSpaceKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *batch_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *space_tensor, - StatsFuture *future) { + Tensor *space_tensor) { std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); @@ -116,10 +116,10 @@ MaceStatus BatchToSpaceKernel::Compute( Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/bias_add.h b/mace/kernels/opencl/image/bias_add.h index 
3a84cbceefe6fdcce5d4573a5e8d42c5c3108992..2180df11e49d11c3a0fd2bce213d000664dd56c4 100644 --- a/mace/kernels/opencl/image/bias_add.h +++ b/mace/kernels/opencl/image/bias_add.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ #define MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ -#include "mace/kernels/bias_add.h" +#include "mace/kernels/opencl/bias_add.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,11 +34,10 @@ template class BiasAddKernel : public OpenCLBiasAddKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *bias, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -46,11 +47,10 @@ class BiasAddKernel : public OpenCLBiasAddKernel { template MaceStatus BiasAddKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *bias, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t height = input->dim(1); const index_t width = input->dim(2); @@ -111,8 +111,8 @@ MaceStatus BiasAddKernel::Compute( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -120,7 +120,7 @@ MaceStatus BiasAddKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/buffer_to_image.h b/mace/kernels/opencl/image/buffer_to_image.h index a791c064d4b5242aa3d0b58be6fc7bdcbcaf44fe..208c33faa327fc9ffe6e681ab56e207a8be0b42b 100644 --- a/mace/kernels/opencl/image/buffer_to_image.h +++ 
b/mace/kernels/opencl/image/buffer_to_image.h @@ -15,11 +15,14 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ #define MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ +#include "mace/kernels/opencl/buffer_transform.h" + #include #include #include -#include "mace/kernels/buffer_transform.h" +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -31,12 +34,11 @@ template class BufferToImage : public OpenCLBufferTransformKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -45,12 +47,11 @@ class BufferToImage : public OpenCLBufferTransformKernel { template MaceStatus BufferToImage::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); @@ -186,8 +187,8 @@ MaceStatus BufferToImage::Compute( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -198,7 +199,7 @@ MaceStatus BufferToImage::Compute( // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/channel_shuffle.h b/mace/kernels/opencl/image/channel_shuffle.h index 5034f56917307984025b7d5d6d08fcb9facb391b..8d351c0ad2464062a711b6b58e9da5421138c66e 100644 --- a/mace/kernels/opencl/image/channel_shuffle.h +++ b/mace/kernels/opencl/image/channel_shuffle.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ #define MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ -#include "mace/kernels/channel_shuffle.h" +#include "mace/kernels/opencl/channel_shuffle.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -33,10 +35,9 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { public: explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const int groups_; @@ -47,10 +48,12 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { template MaceStatus ChannelShuffleKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { + MACE_CHECK(input->dim(3) % groups_ == 0, + "input channels must be an integral multiple of group. 
", + input->dim(3)); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); const index_t batch = input->dim(0); @@ -105,9 +108,9 @@ MaceStatus ChannelShuffleKernel::Compute( Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/concat.cc b/mace/kernels/opencl/image/concat.cc index 60144d558107418bd27a55a49b2f629fb7c64712..9fc6dd486af0114dc1e0e59c480b752d63263ff2 100644 --- a/mace/kernels/opencl/image/concat.cc +++ b/mace/kernels/opencl/image/concat.cc @@ -46,14 +46,13 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -MaceStatus Concat2(OpKernelContext *context, +MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -112,17 +111,16 @@ MaceStatus Concat2(OpKernelContext *context, Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -MaceStatus ConcatN(OpKernelContext *context, +MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -185,7 +183,7 @@ MaceStatus ConcatN(OpKernelContext *context, } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if 
(future != nullptr && runtime->is_profiling_enabled()) { + if (context->future() != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; runtime->GetCallStats(event, &tmp_stats); @@ -194,8 +192,8 @@ MaceStatus ConcatN(OpKernelContext *context, call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; } } - if (future != nullptr) { - future->wait_fn = [call_stats](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -203,7 +201,7 @@ MaceStatus ConcatN(OpKernelContext *context, }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace concat diff --git a/mace/kernels/opencl/image/concat.h b/mace/kernels/opencl/image/concat.h index 6289a000f78d253ba864bfb9918baa7f398ecd47..4041cc3e3bf772a84f901837c12b7ab0844660a2 100644 --- a/mace/kernels/opencl/image/concat.h +++ b/mace/kernels/opencl/image/concat.h @@ -14,11 +14,13 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ #define MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ -#include "mace/kernels/concat.h" +#include "mace/kernels/opencl/concat.h" #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -26,22 +28,20 @@ namespace kernels { namespace opencl { namespace image { namespace concat { -MaceStatus Concat2(OpKernelContext *context, +MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); -MaceStatus ConcatN(OpKernelContext *context, +MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); } // namespace concat 
@@ -50,10 +50,9 @@ class ConcatKernel : public OpenCLConcatKernel { public: explicit ConcatKernel(const int32_t axis) : axis_(axis) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: int32_t axis_; @@ -64,10 +63,9 @@ class ConcatKernel : public OpenCLConcatKernel { template MaceStatus ConcatKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const int inputs_count = input_list.size(); MACE_CHECK(inputs_count >= 2 && axis_ == 3) << "Concat opencl kernel only support >=2 elements with axis == 3"; @@ -101,18 +99,17 @@ MaceStatus ConcatKernel::Compute( case 2: return concat::Concat2( context, &kernel_, input_list[0], input_list[1], - DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_); + DataTypeToEnum::value, &input_shape_, output, &kwg_size_); default: if (divisible_four) { return concat::ConcatN(context, &kernel_, input_list, - DataTypeToEnum::value, output, future, - &kwg_size_); + DataTypeToEnum::value, output, &kwg_size_); } else { MACE_NOT_IMPLEMENTED; } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/conv_2d.h b/mace/kernels/opencl/image/conv_2d.h index 05ee6a0750d129df06d97ce2bbf4f8aa9d70cf7e..415beac4610b26f4cb88b0032715d1e61bb54104 100644 --- a/mace/kernels/opencl/image/conv_2d.h +++ b/mace/kernels/opencl/image/conv_2d.h @@ -14,11 +14,13 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ #define MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/conv_2d.h" #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -26,7 +28,7 @@ namespace kernels { namespace opencl { namespace image { -extern 
MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK1x1(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -39,10 +41,9 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); -extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK3x3(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -55,10 +56,9 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); -extern MaceStatus Conv2dOpencl(OpKernelContext *context, +extern MaceStatus Conv2dOpencl(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -71,7 +71,6 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); @@ -79,7 +78,7 @@ template class Conv2dKernel : public OpenCLConv2dKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -89,8 +88,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -100,7 +98,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { template MaceStatus Conv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -110,15 +108,14 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { 
typedef MaceStatus (*Conv2dOpenclFunction)( - OpKernelContext *context, - cl::Kernel * kernel, const Tensor *input, const Tensor *filter, + OpContext *context, + cl::Kernel *kernel, const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, const int *padding, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, StatsFuture *future, + std::vector *input_shape, Tensor *output, uint32_t *kwg_size); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[3] = { @@ -161,13 +158,13 @@ MaceStatus Conv2dKernel::Compute( return conv2d_func(context, &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_); + output, &kwg_size_); } else { return Conv2dOpencl( context, &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_); + output, &kwg_size_); } } diff --git a/mace/kernels/opencl/image/conv_2d_1x1.cc b/mace/kernels/opencl/image/conv_2d_1x1.cc index 2460afe5a2cd16510aee171a6f1447a7bacaa95d..36f8ba3442001074f601d8d668b3ca24a5b9e67a 100644 --- a/mace/kernels/opencl/image/conv_2d_1x1.cc +++ b/mace/kernels/opencl/image/conv_2d_1x1.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -66,7 +66,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK1x1(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -79,7 +79,6 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); @@ -170,9 +169,9 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/conv_2d_3x3.cc b/mace/kernels/opencl/image/conv_2d_3x3.cc index 900cd6102c50ea5a1704901189f8124d726e0fdf..f2f94c03beefc4537756cd9aa8e44aa226b774d8 100644 --- a/mace/kernels/opencl/image/conv_2d_3x3.cc +++ b/mace/kernels/opencl/image/conv_2d_3x3.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" #include "mace/utils/utils.h" namespace mace { @@ -60,7 +59,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK3x3(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -73,7 +72,6 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -158,9 +156,9 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/conv_2d_general.cc b/mace/kernels/opencl/image/conv_2d_general.cc index 0286edf7346837f8d8ea45939e075611b2059c1b..8221814e8f57e91ce530afc8d8152b6bb01a9b11 100644 --- a/mace/kernels/opencl/image/conv_2d_general.cc +++ b/mace/kernels/opencl/image/conv_2d_general.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/activation.h" #include "mace/utils/utils.h" namespace mace { @@ -68,7 +67,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpencl(OpKernelContext *context, +extern MaceStatus Conv2dOpencl(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -81,7 +80,6 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -170,10 +168,10 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, std::vector lws = LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/crop.h b/mace/kernels/opencl/image/crop.h index 10aa6ecbf8abdfe173ee43cb6ed2773947d8bcb7..7ab8ce1cfe9a8023d5c3ed83ffad3e1c1ac50b10 100644 --- a/mace/kernels/opencl/image/crop.h +++ b/mace/kernels/opencl/image/crop.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CROP_H_ #define MACE_KERNELS_OPENCL_IMAGE_CROP_H_ -#include "mace/kernels/crop.h" +#include "mace/kernels/opencl/crop.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,10 +38,9 @@ class CropKernel : public OpenCLCropKernel { const std::vector &offset) : axis_(axis), offset_(offset) {} MaceStatus Compute( - OpKernelContext *context, + 
OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const int axis_; @@ -51,10 +52,9 @@ class CropKernel : public OpenCLCropKernel { template MaceStatus CropKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const int32_t inputs_count = static_cast(input_list.size()); MACE_CHECK(inputs_count >= 2) << "Crop opencl kernel only support 2 elements input"; @@ -181,9 +181,9 @@ MaceStatus CropKernel::Compute( Concat("crop_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/deconv_2d.h b/mace/kernels/opencl/image/deconv_2d.h index f1ce71c80af3ac4f2d36b88bb993cbe0ff65aac3..eae5978a7795b9c81dee76b05e500e1ea8f39d31 100644 --- a/mace/kernels/opencl/image/deconv_2d.h +++ b/mace/kernels/opencl/image/deconv_2d.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ #define MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ -#include "mace/kernels/deconv_2d.h" +#include "mace/kernels/opencl/deconv_2d.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,7 +34,7 @@ template class Deconv2dKernel : public OpenCLDeconv2dKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -41,8 +43,7 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { const ActivationType activation, const float relux_max_limit, const std::vector &output_shape, - Tensor *output, - StatsFuture *future) 
override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -52,7 +53,7 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { template MaceStatus Deconv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -61,8 +62,7 @@ MaceStatus Deconv2dKernel::Compute( const ActivationType activation, const float relux_max_limit, const std::vector &output_shape, - Tensor *output, - StatsFuture *future) { + Tensor *output) { std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); @@ -174,10 +174,10 @@ MaceStatus Deconv2dKernel::Compute( Concat("deconv2d_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/depth_to_space.h b/mace/kernels/opencl/image/depth_to_space.h index 280cdaa66703c9ea7a0d6bc4599cc11b0e725547..0a961d53bee64032d5017e6cf2ad45112529d40a 100644 --- a/mace/kernels/opencl/image/depth_to_space.h +++ b/mace/kernels/opencl/image/depth_to_space.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ #define MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ -#include "mace/kernels/depth_to_space.h" +#include "mace/kernels/opencl/depth_to_space.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,10 +36,9 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { explicit DepthToSpaceKernel(const int block_size) : block_size_(block_size) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + 
Tensor *output) override; private: const int block_size_; @@ -48,10 +49,9 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { template MaceStatus DepthToSpaceKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t input_height = input->dim(1); const index_t input_width = input->dim(2); @@ -130,10 +130,10 @@ MaceStatus DepthToSpaceKernel::Compute( output_width, output_depth); const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/depthwise_conv2d.cc b/mace/kernels/opencl/image/depthwise_conv2d.cc index 00f0102ec10bab243c29b3f6e67355f49635ad22..57953960ab1dc89b96a5e4fddf48b703837b4a1e 100644 --- a/mace/kernels/opencl/image/depthwise_conv2d.cc +++ b/mace/kernels/opencl/image/depthwise_conv2d.cc @@ -63,7 +63,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM @@ -76,7 +76,6 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -181,10 +180,10 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - 
return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace depthwise diff --git a/mace/kernels/opencl/image/depthwise_conv2d.h b/mace/kernels/opencl/image/depthwise_conv2d.h index 8b5568f5ae16ef901df72ebb09d6457e4f4aa08a..7bfa9ede7b0d7ff31356c51d086ad35fa827f45d 100644 --- a/mace/kernels/opencl/image/depthwise_conv2d.h +++ b/mace/kernels/opencl/image/depthwise_conv2d.h @@ -14,11 +14,13 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ #define MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ -#include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/opencl/depthwise_conv2d.h" #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -27,7 +29,7 @@ namespace opencl { namespace image { namespace depthwise { -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM @@ -40,7 +42,6 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); } // namespace depthwise @@ -49,7 +50,7 @@ template class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -59,8 +60,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -70,7 +70,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { template MaceStatus DepthwiseConv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -80,8 +80,7 @@ 
MaceStatus DepthwiseConv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); if (strides[0] != strides[1]) { @@ -120,7 +119,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( return depthwise::DepthwiseConv2d( context, &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, activation, relux_max_limit, DataTypeToEnum::value, - &input_shape_, output, future, &kwg_size_); + &input_shape_, output, &kwg_size_); } } // namespace image diff --git a/mace/kernels/opencl/image/eltwise.h b/mace/kernels/opencl/image/eltwise.h index c2bbc3a53e50ce02c8151a6859d81e9d589278e3..d23526029802b50ca026f28233e6b6a232739e09 100644 --- a/mace/kernels/opencl/image/eltwise.h +++ b/mace/kernels/opencl/image/eltwise.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ #define MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ -#include "mace/kernels/eltwise.h" +#include "mace/kernels/opencl/eltwise.h" #include #include @@ -22,6 +22,9 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/eltwise.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -42,11 +45,10 @@ class EltwiseKernel : public OpenCLEltwiseKernel { scalar_input_(scalar_input), scalar_input_index_(scalar_input_index) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input0, const Tensor *input1, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: EltwiseType type_; @@ -60,11 +62,10 @@ class EltwiseKernel : public OpenCLEltwiseKernel { template MaceStatus EltwiseKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input0, const Tensor *input1, - Tensor *output, - StatsFuture *future) { + Tensor *output) { bool swapped = false; if (input1 != nullptr) { 
MACE_CHECK(input0->dim_size() == input1->dim_size() || @@ -177,9 +178,9 @@ MaceStatus EltwiseKernel::Compute( Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/fully_connected.h b/mace/kernels/opencl/image/fully_connected.h index d0d921d87021968a95d2a6359534719ca1183b56..605c9ee95cbe2e9827e0b7b1eb3fd6addc7629cb 100644 --- a/mace/kernels/opencl/image/fully_connected.h +++ b/mace/kernels/opencl/image/fully_connected.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ #define MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ -#include "mace/kernels/fully_connected.h" +#include "mace/kernels/opencl/fully_connected.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,14 +34,13 @@ template class FullyConnectedKernel : public OpenCLFullyConnectedKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *weight, const Tensor *bias, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -50,14 +51,13 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel { template MaceStatus FullyConnectedKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *weight, const Tensor *bias, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector output_image_shape; 
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, @@ -170,8 +170,8 @@ MaceStatus FullyConnectedKernel::Compute( MACE_OUT_OF_RANGE_VALIDATION; MACE_CL_RET_STATUS(error); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -179,7 +179,7 @@ MaceStatus FullyConnectedKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/image_to_buffer.h b/mace/kernels/opencl/image/image_to_buffer.h index 0a345bf584a58c79d4b5a706181dd70aefeb8d7a..da8667f0f02a7867d97056101abc4b0a2b803cc2 100644 --- a/mace/kernels/opencl/image/image_to_buffer.h +++ b/mace/kernels/opencl/image/image_to_buffer.h @@ -19,7 +19,8 @@ #include #include -#include "mace/kernels/buffer_inverse_transform.h" +#include "mace/core/op_context.h" +#include "mace/kernels/opencl/buffer_inverse_transform.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -30,12 +31,11 @@ namespace image { template class ImageToBuffer : public OpenCLBufferInverseTransformKernel { public: - MaceStatus Compute(OpKernelContext *context, + MaceStatus Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -43,12 +43,11 @@ class ImageToBuffer : public OpenCLBufferInverseTransformKernel { }; template -MaceStatus ImageToBuffer::Compute(OpKernelContext *context, +MaceStatus ImageToBuffer::Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; CalImage2DShape(formatted_buffer_shape, 
type, &image_shape, wino_blk_size); @@ -172,8 +171,8 @@ MaceStatus ImageToBuffer::Compute(OpKernelContext *context, } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -181,7 +180,7 @@ MaceStatus ImageToBuffer::Compute(OpKernelContext *context, }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/lstm_cell.h b/mace/kernels/opencl/image/lstm_cell.h index 2b7d41d48f5f86415415525d5644df4e9531b1e2..00b0735648064070456675853dc032280f3a5779 100644 --- a/mace/kernels/opencl/image/lstm_cell.h +++ b/mace/kernels/opencl/image/lstm_cell.h @@ -14,12 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ #define MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ +#include "mace/kernels/opencl/lstm_cell.h" + #include #include #include #include -#include "mace/kernels/lstmcell.h" +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,15 +37,14 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { const T forget_bias) : forget_bias_(forget_bias) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *pre_output, const Tensor *weight, const Tensor *bias, const Tensor *pre_cell, Tensor *cell, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: T forget_bias_; @@ -53,15 +55,14 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { template MaceStatus LSTMCellKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *pre_output, const Tensor *weight, const Tensor *bias, const Tensor *pre_cell, Tensor *cell, - Tensor *output, - StatsFuture *future) { 
+ Tensor *output) { MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, "LSTM hidden units should be a multiple of 4"); @@ -126,10 +127,10 @@ MaceStatus LSTMCellKernel::Compute( std::string tuning_key = Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/matmul.h b/mace/kernels/opencl/image/matmul.h index 751887529c47fca09dc1f58a8beafc7708539b98..aa68864613ff389ecf112d4e75403c55fc2cfc14 100644 --- a/mace/kernels/opencl/image/matmul.h +++ b/mace/kernels/opencl/image/matmul.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ #define MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ -#include "mace/kernels/matmul.h" +#include "mace/kernels/opencl/matmul.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -33,13 +35,12 @@ template class MatMulKernel : public OpenCLMatMulKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *A, const Tensor *B, Tensor *C, bool transpose_a, - bool transpose_b, - StatsFuture *future) override; + bool transpose_b) override; private: cl::Kernel kernel_; @@ -48,14 +49,12 @@ class MatMulKernel : public OpenCLMatMulKernel { template MaceStatus MatMulKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *A, const Tensor *B, Tensor *C, bool transpose_a, - bool transpose_b, - StatsFuture *future) { - MACE_UNUSED(future); + bool transpose_b) { MACE_CHECK(!transpose_a && !transpose_b, "GPU does not support transpose matmul"); @@ -115,10 +114,10 @@ MaceStatus MatMulKernel::Compute( const std::vector lws = {kwg_size_ / 64, 64, 0}; std::string 
tuning_key = Concat("matmul_opencl_kernel", batch, height, width); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/pad.h b/mace/kernels/opencl/image/pad.h index 1533b6d2b8ae850f1144cb5f2e5f78bf6120de0f..b9673e9ef5e2f4ca631d082f6cb62e1f0ae4fb11 100644 --- a/mace/kernels/opencl/image/pad.h +++ b/mace/kernels/opencl/image/pad.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_PAD_H_ #define MACE_KERNELS_OPENCL_IMAGE_PAD_H_ -#include "mace/kernels/pad.h" +#include "mace/kernels/opencl/pad.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,10 +38,9 @@ class PadKernel : public OpenCLPadKernel { : paddings_(paddings), constant_value_(constant_value) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: std::vector paddings_; @@ -51,10 +52,9 @@ class PadKernel : public OpenCLPadKernel { template MaceStatus PadKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(this->paddings_.size() == static_cast((input->dim_size() * 2))); MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && @@ -122,10 +122,10 @@ MaceStatus PadKernel::Compute( std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git 
a/mace/kernels/opencl/image/pooling.h b/mace/kernels/opencl/image/pooling.h index 8b11475e7131fa80fbd39e48a570f761c5c15c53..769f3cf89b30ad9b4bb0b213d0091340f3a0ba31 100644 --- a/mace/kernels/opencl/image/pooling.h +++ b/mace/kernels/opencl/image/pooling.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ #define MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ -#include "mace/kernels/pooling.h" +#include "mace/kernels/opencl/pooling.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -59,7 +61,7 @@ template class PoolingKernel : public OpenCLPoolingKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -67,8 +69,7 @@ class PoolingKernel : public OpenCLPoolingKernel { const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -78,7 +79,7 @@ class PoolingKernel : public OpenCLPoolingKernel { template MaceStatus PoolingKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -86,8 +87,7 @@ MaceStatus PoolingKernel::Compute( const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) << "Pooling opencl kernel not support dilation yet"; @@ -173,10 +173,10 @@ MaceStatus PoolingKernel::Compute( Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return 
MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/reduce_mean.h b/mace/kernels/opencl/image/reduce_mean.h index 3c826540ff026315fec84980114d6ad0eaf19cec..7d7c5fbaf2ac8d48b07c8a5a90515b37a4f538ad 100644 --- a/mace/kernels/opencl/image/reduce_mean.h +++ b/mace/kernels/opencl/image/reduce_mean.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ #define MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ -#include "mace/kernels/reduce_mean.h" +#include "mace/kernels/opencl/reduce_mean.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,10 +38,9 @@ class ReduceMeanKernel : public OpenCLReduceMeanKernel { : axis_(axis), keep_dims_(keep_dims) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const std::vector axis_; @@ -51,10 +52,9 @@ class ReduceMeanKernel : public OpenCLReduceMeanKernel { template MaceStatus ReduceMeanKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK_NOTNULL(input); // MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); MACE_CHECK(input->dim_size() == 4, @@ -157,8 +157,8 @@ MaceStatus ReduceMeanKernel::Compute( MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -166,7 +166,7 @@ MaceStatus ReduceMeanKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/resize_bicubic.h 
b/mace/kernels/opencl/image/resize_bicubic.h index 669f644539d6e5cd97a1561003068c69cec13f20..20d062ac115dc312ced45f7f6b1bfbdbe342bef6 100644 --- a/mace/kernels/opencl/image/resize_bicubic.h +++ b/mace/kernels/opencl/image/resize_bicubic.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ #define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ -#include "mace/kernels/resize_bicubic.h" +#include "mace/kernels/opencl/resize_bicubic.h" #include #include @@ -22,7 +22,10 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" +#include "mace/kernels/resize_bicubic.h" namespace mace { namespace kernels { @@ -68,10 +71,9 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { out_width_(out_width) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: bool align_corners_; @@ -84,10 +86,9 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { template MaceStatus ResizeBicubicKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t in_height = input->dim(1); const index_t in_width = input->dim(2); @@ -113,7 +114,9 @@ MaceStatus ResizeBicubicKernel::Compute( built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); + built_options.emplace( + MakeString("-DTABLE_SIZE=", + mace::kernels::resize_bicubic::kTableSize)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("resize_bicubic", kernel_name, @@ -135,9 +138,11 @@ MaceStatus ResizeBicubicKernel::Compute( 
MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); + mace::kernels::resize_bicubic::CalculateResizeScale( + in_height, out_height, align_corners_); float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); + mace::kernels::resize_bicubic::CalculateResizeScale( + in_width, out_width, align_corners_); uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); @@ -159,10 +164,10 @@ MaceStatus ResizeBicubicKernel::Compute( Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/resize_bilinear.h b/mace/kernels/opencl/image/resize_bilinear.h index 459babc9f4aacff9e049f38cd1dd2fc5f62343df..d34b7d50672f2ec3927475fdaa07f726844ba2d9 100644 --- a/mace/kernels/opencl/image/resize_bilinear.h +++ b/mace/kernels/opencl/image/resize_bilinear.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ #define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ -#include "mace/kernels/resize_bilinear.h" +#include "mace/kernels/opencl/resize_bilinear.h" #include #include @@ -22,7 +22,10 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" +#include "mace/kernels/resize_bilinear.h" namespace mace { namespace kernels { @@ -73,10 +76,9 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { out_width_(out_width) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: bool align_corners_; @@ -89,10 +91,9 @@ class 
ResizeBilinearKernel : public OpenCLResizeBilinearKernel { template MaceStatus ResizeBilinearKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t in_height = input->dim(1); const index_t in_width = input->dim(2); @@ -138,9 +139,13 @@ MaceStatus ResizeBilinearKernel::Compute( MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); + mace::kernels::resize_bilinear::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); + mace::kernels::resize_bilinear::CalculateResizeScale(in_width, + out_width, + align_corners_); uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); @@ -162,10 +167,10 @@ MaceStatus ResizeBilinearKernel::Compute( Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/softmax.h b/mace/kernels/opencl/image/softmax.h index 0c3aa61862a39c3d52a1db06406a7c3e46fbfd2f..cf2dd5b4351560a66d1dd242f6aeb971fd3260a0 100644 --- a/mace/kernels/opencl/image/softmax.h +++ b/mace/kernels/opencl/image/softmax.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_ #define MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_ -#include "mace/kernels/softmax.h" +#include "mace/kernels/opencl/softmax.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -58,10 +60,9 @@ template class 
SoftmaxKernel : public OpenCLSoftmaxKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -71,10 +72,9 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { template MaceStatus SoftmaxKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) { + Tensor *output) { index_t batch = 0; index_t height = 0; index_t width = 0; @@ -137,10 +137,10 @@ MaceStatus SoftmaxKernel::Compute( std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/space_to_batch.h b/mace/kernels/opencl/image/space_to_batch.h index 89bcdf6ab4e53ca6ba1d31cd36882a4f36b5949c..0a20e6f6dc2194f799428353146f800e3f376b1b 100644 --- a/mace/kernels/opencl/image/space_to_batch.h +++ b/mace/kernels/opencl/image/space_to_batch.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_ #define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_ -#include "mace/kernels/space_to_batch.h" +#include "mace/kernels/opencl/space_to_batch.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,13 +34,12 @@ template class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *space_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *batch_tensor, - StatsFuture *future) override; + Tensor *batch_tensor) override; 
private: cl::Kernel kernel_; @@ -48,13 +49,12 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { template MaceStatus SpaceToBatchKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *space_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *batch_tensor, - StatsFuture *future) { + Tensor *batch_tensor) { std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); @@ -114,10 +114,10 @@ MaceStatus SpaceToBatchKernel::Compute( Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/space_to_depth.h b/mace/kernels/opencl/image/space_to_depth.h index e1247dc31c5f5364d0431d0c7763a247ff5285c5..2e3f2a747d5fcf36aef8fdbef77a44d2d7fd0265 100644 --- a/mace/kernels/opencl/image/space_to_depth.h +++ b/mace/kernels/opencl/image/space_to_depth.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_ #define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_ -#include "mace/kernels/space_to_depth.h" +#include "mace/kernels/opencl/space_to_depth.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,10 +36,9 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { explicit SpaceToDepthKernel(const int block_size) : block_size_(block_size) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const int block_size_; @@ -48,10 +49,9 @@ class SpaceToDepthKernel 
: public OpenCLSpaceToDepthKernel { template MaceStatus SpaceToDepthKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t input_height = input->dim(1); const index_t input_width = input->dim(2); @@ -124,10 +124,10 @@ MaceStatus SpaceToDepthKernel::Compute( std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), input->dim(1), input->dim(2), input->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/split.h b/mace/kernels/opencl/image/split.h index a75642a8317dd084234d175d8f29ea3e49836a78..ee7fab71415ff8a8d2d6b7486468e5b53a7577f5 100644 --- a/mace/kernels/opencl/image/split.h +++ b/mace/kernels/opencl/image/split.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_ #define MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_ -#include "mace/kernels/split.h" +#include "mace/kernels/opencl/split.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,10 +36,9 @@ class SplitKernel : public OpenCLSplitKernel { public: explicit SplitKernel(const int32_t axis) : axis_(axis) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - const std::vector &output_list, - StatsFuture *future) override; + const std::vector &output_list) override; private: int32_t axis_; @@ -47,10 +48,9 @@ class SplitKernel : public OpenCLSplitKernel { template MaceStatus SplitKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { + const std::vector 
&output_list) { const index_t input_channels = input->dim(3); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; @@ -123,7 +123,7 @@ MaceStatus SplitKernel::Compute( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr && runtime->is_profiling_enabled()) { + if (context->future() != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; runtime->GetCallStats(event, &tmp_stats); @@ -132,8 +132,8 @@ MaceStatus SplitKernel::Compute( call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; } } - if (future != nullptr) { - future->wait_fn = [runtime, call_stats](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -141,7 +141,7 @@ MaceStatus SplitKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/sqrdiff_mean.h b/mace/kernels/opencl/image/sqrdiff_mean.h index 31959a62759a7e8f0422ff24aaa9427be639cc96..3d86b05d3cd6dc8bfc308c1947bfc8eec85718ba 100644 --- a/mace/kernels/opencl/image/sqrdiff_mean.h +++ b/mace/kernels/opencl/image/sqrdiff_mean.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_ #define MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_ -#include "mace/kernels/sqrdiff_mean.h" +#include "mace/kernels/opencl/sqrdiff_mean.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -31,14 +33,11 @@ namespace image { template class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { public: - SqrDiffMeanKernel() {} - MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const 
Tensor *input1, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -48,11 +47,10 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { template MaceStatus SqrDiffMeanKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input0, const Tensor *input1, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK_NOTNULL(input0); MACE_CHECK_NOTNULL(input1); MACE_CHECK(input0->dim(0) == input1->dim(0) && @@ -156,8 +154,8 @@ MaceStatus SqrDiffMeanKernel::Compute( MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -165,7 +163,7 @@ MaceStatus SqrDiffMeanKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/winograd_transform.h b/mace/kernels/opencl/image/winograd_transform.h index 107c8dc0cf3f29b4003de29516fc880d995cfe38..f00e5556438446120303261550b54f9b185d75fd 100644 --- a/mace/kernels/opencl/image/winograd_transform.h +++ b/mace/kernels/opencl/image/winograd_transform.h @@ -14,13 +14,17 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ #define MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ -#include "mace/kernels/winograd_transform.h" +#include "mace/kernels/opencl/winograd_transform.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,7 +36,7 @@ template class WinogradTransformKernel : public OpenCLWinogradTransformKernel { public: WinogradTransformKernel( - const Padding &padding_type, + Padding 
padding_type, const std::vector &paddings, const int block_size) : strides_({1, 1}), @@ -41,10 +45,9 @@ class WinogradTransformKernel : public OpenCLWinogradTransformKernel { paddings_(paddings), wino_blk_size_(block_size) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future) override; + Tensor *output_tensor) override; private: const std::vector strides_; // [stride_h, stride_w] @@ -59,10 +62,9 @@ class WinogradTransformKernel : public OpenCLWinogradTransformKernel { template MaceStatus WinogradTransformKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor) { auto runtime = context->device()->opencl_runtime(); MACE_OUT_OF_RANGE_DEFINITION; @@ -83,7 +85,7 @@ MaceStatus WinogradTransformKernel::Compute( + obfuscated_kernel_name); } else { MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(DataTypeToEnum::value)); @@ -162,10 +164,10 @@ MaceStatus WinogradTransformKernel::Compute( output_tensor->dim(1), output_tensor->dim(2)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } template @@ -173,17 +175,16 @@ class WinogradInverseTransformKernel : public OpenCLWinogradInverseTransformKernel { public: WinogradInverseTransformKernel( - const ActivationType activation, + ActivationType activation, const float relux_max_limit, const int block_size) : wino_blk_size_(block_size), activation_(activation), relux_max_limit_(relux_max_limit) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &inputs, - Tensor *output_tensor, - StatsFuture 
*future) override; + Tensor *output_tensor) override; private: const int wino_blk_size_; @@ -196,10 +197,9 @@ class WinogradInverseTransformKernel template MaceStatus WinogradInverseTransformKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &inputs, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor) { auto runtime = context->device()->opencl_runtime(); MACE_OUT_OF_RANGE_DEFINITION; @@ -223,7 +223,7 @@ MaceStatus WinogradInverseTransformKernel::Compute( + obfuscated_kernel_name); } else { MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } built_options.emplace("-DDATA_TYPE=" + @@ -312,10 +312,10 @@ MaceStatus WinogradInverseTransformKernel::Compute( output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3), input_tensor->dim(2)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image } // namespace opencl diff --git a/mace/kernels/opencl/lstm_cell.h b/mace/kernels/opencl/lstm_cell.h new file mode 100644 index 0000000000000000000000000000000000000000..0ce1d26f7c6bd683906c6021a1bc0505e0b1f506 --- /dev/null +++ b/mace/kernels/opencl/lstm_cell.h @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_LSTM_CELL_H_ +#define MACE_KERNELS_OPENCL_LSTM_CELL_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLLSTMCellKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLLSTMCellKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_LSTM_CELL_H_ diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc deleted file mode 100644 index e210ee582bfc8a9cac401919d97fa3784954c467..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/lstmcell.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/lstmcell.h" -#include "mace/kernels/opencl/image/lstm_cell.h" - -namespace mace { -namespace kernels { - -template -LSTMCellFunctor::LSTMCellFunctor( - OpKernelContext *context, - T forget_bias) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus LSTMCellFunctor::operator()( - const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, pre_output, weight, bias, - pre_cell, cell, output, future); -} - -template struct LSTMCellFunctor; -template struct LSTMCellFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc deleted file mode 100644 index b8ddc1c84e8039a907bad6ade8ce444a20012228..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/matmul.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/matmul.h" -#include "mace/kernels/opencl/image/matmul.h" - -namespace mace { -namespace kernels { - -template -MatMulFunctor::MatMulFunctor(OpKernelContext *context) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::MatMulKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus MatMulFunctor::operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) { - return kernel_->Compute(context_, A, B, C, transpose_a, transpose_b, future); -} - -template struct MatMulFunctor; - -template struct MatMulFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/matmul.h b/mace/kernels/opencl/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..e971328e1bf8f62f1ba73d383e97dca3aa93ae2c --- /dev/null +++ b/mace/kernels/opencl/matmul.h @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_MATMUL_H_ +#define MACE_KERNELS_OPENCL_MATMUL_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLMatMulKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLMatMulKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_MATMUL_H_ diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index f61b9e87cfe0a91a59478167fcccbce565cbe793..957026b23ac26c5247fd46fb5ca5fe154d62cda4 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -16,7 +16,7 @@ #include #include "gtest/gtest.h" -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" @@ -27,10 +27,10 @@ namespace mace { namespace kernels { namespace { -bool BufferToImageOpImpl(OpKernelContext *context, - Tensor *buffer, - Tensor *image, - const std::vector &image_shape) { +MaceStatus BufferToImageOpImpl(OpContext *context, + Tensor *buffer, + Tensor *image, + const std::vector &image_shape) { std::unique_ptr oorc_flag; uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; @@ -59,14 +59,10 @@ bool BufferToImageOpImpl(OpKernelContext *context, } cl::Kernel kernel; - cl_int error = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, - built_options, - &kernel); - if (error != CL_SUCCESS) { - return false; - } - + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &kernel)); MACE_OUT_OF_RANGE_INIT(kernel); uint32_t idx = 0; if 
(runtime->IsOutOfRangeCheckEnabled()) { @@ -89,6 +85,7 @@ bool BufferToImageOpImpl(OpKernelContext *context, static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); const std::vector lws = {16, kwg_size / 16}; + cl_int error; cl::Event event; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( @@ -105,7 +102,7 @@ bool BufferToImageOpImpl(OpKernelContext *context, cl::NDRange(lws[0], lws[1]), nullptr, &event); } if (error != CL_SUCCESS) { - return false; + return MaceStatus::MACE_OUT_OF_RESOURCES; } runtime->command_queue().finish(); @@ -115,7 +112,8 @@ bool BufferToImageOpImpl(OpKernelContext *context, is_out_of_range = *(oorc_flag->mutable_data()) == 1 ? true : false; oorc_flag->UnMap(); } - return is_out_of_range; + return is_out_of_range ? MaceStatus::MACE_OUT_OF_RESOURCES + : MaceStatus::MACE_SUCCESS; } } // namespace @@ -135,7 +133,7 @@ TEST(OutOfRangeCheckTest, RandomTest) { std::unique_ptr device(new GPUDevice(gpu_context.opencl_tuner())); Workspace ws; - OpKernelContext context(&ws, device.get()); + OpContext context(&ws, device.get()); std::vector buffer_shape = {batch, height, width, channels}; Tensor *buffer = @@ -148,7 +146,8 @@ TEST(OutOfRangeCheckTest, RandomTest) { DataTypeToEnum::v()); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); image->ResizeImage(buffer->shape(), image_shape); - ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape)); + ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape) + != MaceStatus::MACE_SUCCESS); std::vector overflow_image_shape = image_shape; for (size_t i = 0; i < overflow_image_shape.size(); ++i) { @@ -157,7 +156,8 @@ TEST(OutOfRangeCheckTest, RandomTest) { ASSERT_TRUE(BufferToImageOpImpl(&context, buffer, image, - overflow_image_shape)); + overflow_image_shape) + != MaceStatus::MACE_SUCCESS); } } // namespace kernels diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc deleted file mode 100644 index 
759b9219ce06d601716b36b90bc03513de51de9b..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/pad.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/pad.h" -#include "mace/kernels/opencl/image/pad.h" - -namespace mace { -namespace kernels { - -template -PadFunctor::PadFunctor( - OpKernelContext *context, - const std::vector &paddings, - const float constant_value) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::PadKernel(paddings, constant_value)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus PadFunctor::operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct PadFunctor; -template struct PadFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/pad.h b/mace/kernels/opencl/pad.h new file mode 100644 index 0000000000000000000000000000000000000000..ec91a446f38c552a64c0c453871ed59df1a3f36b --- /dev/null +++ b/mace/kernels/opencl/pad.h @@ -0,0 +1,38 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_PAD_H_ +#define MACE_KERNELS_OPENCL_PAD_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLPadKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLPadKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_PAD_H_ diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc deleted file mode 100644 index aab536643ca9808ca3db22247bfacd3fa8fab916..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/pooling.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/pooling.h" - -#include "mace/kernels/opencl/buffer/pooling.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/kernels/opencl/image/pooling.h" - -namespace mace { -namespace kernels { - -template -PoolingFunctor::PoolingFunctor( - OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::PoolingKernel); - } else { - kernel_.reset(new opencl::buffer::PoolingKernel); - } -} - -template -MaceStatus PoolingFunctor::operator()( - const Tensor *input, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, pooling_type_, kernels_, strides_, - padding_type_, paddings_, dilations_, - output, future); -} - -template struct PoolingFunctor; -template struct PoolingFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/pooling.h b/mace/kernels/opencl/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..ce3c8b543b4b5cbab404eca42541b33a716f2ede --- /dev/null +++ b/mace/kernels/opencl/pooling.h @@ -0,0 +1,46 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_POOLING_H_ +#define MACE_KERNELS_OPENCL_POOLING_H_ + +#include + +#include "mace/kernels/pooling.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { + +class OpContext; +class Tensor; +namespace kernels { +class OpenCLPoolingKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLPoolingKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_POOLING_H_ diff --git a/mace/kernels/opencl/reduce_mean.cc b/mace/kernels/opencl/reduce_mean.cc deleted file mode 100644 index b504334afb9eb1c9b87305c473548cc43cdeae1b..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/reduce_mean.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/reduce_mean.h" -#include "mace/kernels/opencl/image/reduce_mean.h" - -namespace mace { -namespace kernels { - -template -ReduceMeanFunctor::ReduceMeanFunctor( - OpKernelContext *context, - const std::vector &axis, - const bool keep_dims) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ReduceMeanKernel(axis, keep_dims)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ReduceMeanFunctor::operator()( - const Tensor *input, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ReduceMeanFunctor; -template struct ReduceMeanFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/reduce_mean.h b/mace/kernels/opencl/reduce_mean.h new file mode 100644 index 0000000000000000000000000000000000000000..1960aac56f8b059dd1785b93d31416d22bf78f85 --- /dev/null +++ b/mace/kernels/opencl/reduce_mean.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_REDUCE_MEAN_H_ +#define MACE_KERNELS_OPENCL_REDUCE_MEAN_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLReduceMeanKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLReduceMeanKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_REDUCE_MEAN_H_ diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc deleted file mode 100644 index e45ced4bb72e499f2e6a98dcb7d984c4524246fa..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/resize_bicubic.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/resize_bicubic.h" -#include "mace/kernels/opencl/image/resize_bicubic.h" - -namespace mace { -namespace kernels { - -template -ResizeBicubicFunctor::ResizeBicubicFunctor( - OpKernelContext *context, - bool align_corners, - const std::vector &size) - : OpKernel(context) { - MACE_CHECK(size.size() == 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, - size[0], - size[1])); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ResizeBicubicFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ResizeBicubicFunctor; -template struct ResizeBicubicFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/resize_bicubic.h b/mace/kernels/opencl/resize_bicubic.h new file mode 100644 index 0000000000000000000000000000000000000000..bfb6f8b5d98d7b5b79adedc86c95c03ff39312d3 --- /dev/null +++ b/mace/kernels/opencl/resize_bicubic.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_ +#define MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +#include "mace/core/types.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLResizeBicubicKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeBicubicKernel); +}; +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_ diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc deleted file mode 100644 index 585cab767ad05ff3d6666d72db69d38d91db1ec6..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/resize_bilinear.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/resize_bilinear.h" -#include "mace/kernels/opencl/image/resize_bilinear.h" - -namespace mace { -namespace kernels { - -template -ResizeBilinearFunctor::ResizeBilinearFunctor( - OpKernelContext *context, - const std::vector &size, - bool align_corners) : OpKernel(context) { - MACE_CHECK(size.size() == 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, - size[0], - size[1])); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus ResizeBilinearFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ResizeBilinearFunctor; -template struct ResizeBilinearFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/resize_bilinear.h b/mace/kernels/opencl/resize_bilinear.h new file mode 100644 index 0000000000000000000000000000000000000000..f60fb282a6041211a9734e324c46530aa88eb375 --- /dev/null +++ b/mace/kernels/opencl/resize_bilinear.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_ +#define MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_ + +#include "mace/core/types.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLResizeBilinearKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeBilinearKernel); +}; +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_ diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc deleted file mode 100644 index bad5f1fad75b0b0fd8276e4baebf549393d370d1..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/softmax.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/softmax.h" - -#include "mace/kernels/opencl/buffer/softmax.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/kernels/opencl/image/softmax.h" - -namespace mace { -namespace kernels { - -template -SoftmaxFunctor::SoftmaxFunctor(OpKernelContext *context) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SoftmaxKernel); - } else { - kernel_.reset(new opencl::buffer::SoftmaxKernel); - } -} -template -MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, logits, output, future); -} - -template struct SoftmaxFunctor; -template struct SoftmaxFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/softmax.h b/mace/kernels/opencl/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..308b606eda6c3470f865ca3134628b82e41ac57a --- /dev/null +++ b/mace/kernels/opencl/softmax.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_SOFTMAX_H_ +#define MACE_KERNELS_OPENCL_SOFTMAX_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSoftmaxKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *logits, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSoftmaxKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SOFTMAX_H_ diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc deleted file mode 100644 index c69db85c73b5c33857f8b7806f2237c16fa4d337..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/space_to_batch.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ -#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ - -#include "mace/kernels/space_to_batch.h" -#include "mace/kernels/opencl/image/space_to_batch.h" - -namespace mace { -namespace kernels { - -template -SpaceToBatchFunctor::SpaceToBatchFunctor( - OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SpaceToBatchKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SpaceToBatchFunctor::operator()( - const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { - std::vector output_shape(4, 0); - CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, - output_shape.data()); - return kernel_->Compute(context_, space_tensor, paddings_, block_shape_, - output_shape, batch_tensor, future); -} - -template struct SpaceToBatchFunctor; -template struct SpaceToBatchFunctor; - -} // namespace kernels -} // namespace mace -#endif // MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ diff --git a/mace/ops/identity.h b/mace/kernels/opencl/space_to_batch.h similarity index 51% rename from mace/ops/identity.h rename to mace/kernels/opencl/space_to_batch.h index be4d75bf48d2c92281fe70d4014fc5b0f5b063fa..22d308ac17be4ca838aae5cce5b8018abe5c59fb 100644 --- a/mace/ops/identity.h +++ b/mace/kernels/opencl/space_to_batch.h @@ -12,36 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_IDENTITY_H_ -#define MACE_OPS_IDENTITY_H_ +#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ +#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #include -#include "mace/core/operator.h" +#include "mace/core/types.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" namespace mace { -namespace ops { -template -class IdentityOp : public Operator { +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSpaceToBatchKernel { public: - IdentityOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - output->ReuseTensorBuffer(*input); - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; - } - - private: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); + virtual MaceStatus Compute( + OpContext *context, + const Tensor *space_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *batch_tensor) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSpaceToBatchKernel); }; -} // namespace ops +} // namespace kernels } // namespace mace -#endif // MACE_OPS_IDENTITY_H_ +#endif // MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ diff --git a/mace/kernels/opencl/space_to_depth.cc b/mace/kernels/opencl/space_to_depth.cc deleted file mode 100644 index 3e14047b9eb4833d500cce289c51197608fc82f4..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/space_to_depth.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/space_to_depth.h" -#include "mace/kernels/opencl/image/space_to_depth.h" - -namespace mace { -namespace kernels { - -template -SpaceToDepthOpFunctor::SpaceToDepthOpFunctor( - OpKernelContext *context, - const int block_size) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SpaceToDepthKernel(block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SpaceToDepthOpFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct SpaceToDepthOpFunctor; -template struct SpaceToDepthOpFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/space_to_depth.h b/mace/kernels/opencl/space_to_depth.h new file mode 100644 index 0000000000000000000000000000000000000000..ea6b16c15f9f73aaae711f8f4c154ef84161139d --- /dev/null +++ b/mace/kernels/opencl/space_to_depth.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_ +#define MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSpaceToDepthKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSpaceToDepthKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_ diff --git a/mace/kernels/opencl/split.cc b/mace/kernels/opencl/split.cc deleted file mode 100644 index 2f2a046ec0c7cc1929c3c3e3c3fd2642d6685bc0..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/split.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/split.h" -#include "mace/kernels/opencl/image/split.h" - -namespace mace { -namespace kernels { - -template -SplitFunctor::SplitFunctor(OpKernelContext *context, - const int32_t axis) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SplitKernel(axis)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SplitFunctor::operator()( - const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { - return kernel_->Compute(context_, input, output_list, future); -} - -template struct SplitFunctor; -template struct SplitFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/split.h b/mace/kernels/opencl/split.h new file mode 100644 index 0000000000000000000000000000000000000000..c5cacd6f6a2de2b682d62e6869e53a15008e5f8a --- /dev/null +++ b/mace/kernels/opencl/split.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_SPLIT_H_ +#define MACE_KERNELS_OPENCL_SPLIT_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSplitKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const std::vector &output_list) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSplitKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SPLIT_H_ diff --git a/mace/kernels/opencl/sqrdiff_mean.cc b/mace/kernels/opencl/sqrdiff_mean.cc deleted file mode 100644 index a0a6401d86f1e4370e889b16bcf5d9f9c78f4b5d..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/sqrdiff_mean.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/sqrdiff_mean.h" -#include "mace/kernels/opencl/image/sqrdiff_mean.h" - -namespace mace { -namespace kernels { - -template -SqrDiffMeanFunctor::SqrDiffMeanFunctor( - OpKernelContext *context) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SqrDiffMeanKernel()); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SqrDiffMeanFunctor::operator()( - const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input0, input1, output, future); -} - -template struct SqrDiffMeanFunctor; -template struct SqrDiffMeanFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/sqrdiff_mean.h b/mace/kernels/opencl/sqrdiff_mean.h new file mode 100644 index 0000000000000000000000000000000000000000..c2d5d197bff5fff21737c5b9c29c9a650794251c --- /dev/null +++ b/mace/kernels/opencl/sqrdiff_mean.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_ +#define MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSqrDiffMeanKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSqrDiffMeanKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_ diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc deleted file mode 100644 index f64945a4eb1cd14d1f142a042f861e0d825deb18..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/winograd_transform.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/winograd_transform.h" -#include "mace/kernels/opencl/image/winograd_transform.h" - -namespace mace { -namespace kernels { - -template -WinogradTransformFunctor::WinogradTransformFunctor( - OpKernelContext *context, - const Padding &padding_type, - const std::vector &paddings, - const int block_size) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradTransformKernel( - padding_type, paddings, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus WinogradTransformFunctor::operator()( - const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { - return kernel_->Compute(context_, input_tensor, output_tensor, future); -} - -template -WinogradInverseTransformFunctor::WinogradInverseTransformFunctor( // NOLINT(whitespace/line_length) - OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit, - const int block_size) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradInverseTransformKernel( - activation, relux_max_limit, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus WinogradInverseTransformFunctor::operator()( - const std::vector &inputs, - Tensor *output_tensor, - StatsFuture *future) { - return kernel_->Compute(context_, inputs, output_tensor, future); -} - -template struct WinogradTransformFunctor; -template struct WinogradTransformFunctor; - -template struct WinogradInverseTransformFunctor; -template struct WinogradInverseTransformFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/winograd_transform.h b/mace/kernels/opencl/winograd_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..d706e89b12f5516e02349e516514e4cc9a92362f --- /dev/null +++ b/mace/kernels/opencl/winograd_transform.h @@ -0,0 +1,50 @@ +// Copyright 2018 
Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { + +class OpenCLWinogradTransformKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel); +}; + +class OpenCLWinogradInverseTransformKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &inputs, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_ diff --git a/mace/kernels/ops_register.cc b/mace/kernels/ops_register.cc new file mode 100644 index 0000000000000000000000000000000000000000..4dba891022f37101e90c738c717dba6391bf776e --- /dev/null +++ b/mace/kernels/ops_register.cc @@ -0,0 +1,132 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/ops_register.h" + +namespace mace { + +namespace kernels { +// Keep in lexicographical order +extern void RegisterActivation(OpRegistryBase *op_registry); +extern void RegisterAddN(OpRegistryBase *op_registry); +extern void RegisterArgMax(OpRegistryBase *op_registry); +extern void RegisterBatchNorm(OpRegistryBase *op_registry); +extern void RegisterBatchToSpaceND(OpRegistryBase *op_registry); +extern void RegisterBiasAdd(OpRegistryBase *op_registry); +extern void RegisterCast(OpRegistryBase *op_registry); +extern void RegisterChannelShuffle(OpRegistryBase *op_registry); +extern void RegisterConcat(OpRegistryBase *op_registry); +extern void RegisterConv2D(OpRegistryBase *op_registry); +extern void RegisterCrop(OpRegistryBase *op_registry); +extern void RegisterDeconv2D(OpRegistryBase *op_registry); +extern void RegisterDepthToSpace(OpRegistryBase *op_registry); +extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); +extern void RegisterDequantize(OpRegistryBase *op_registry); +extern void RegisterEltwise(OpRegistryBase *op_registry); +extern void RegisterExpandDims(OpRegistryBase *op_registry); +extern void RegisterFill(OpRegistryBase *op_registry); +extern void RegisterFullyConnected(OpRegistryBase *op_registry); +extern void RegisterGather(OpRegistryBase *op_registry); +extern void RegisterIdentity(OpRegistryBase *op_registry); +extern void RegisterInferConv2dShape(OpRegistryBase *op_registry); +extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); +extern void RegisterMatMul(OpRegistryBase 
*op_registry); +extern void RegisterPad(OpRegistryBase *op_registry); +extern void RegisterPooling(OpRegistryBase *op_registry); +extern void RegisterQuantize(OpRegistryBase *op_registry); +extern void RegisterReduceMean(OpRegistryBase *op_registry); +extern void RegisterReshape(OpRegistryBase *op_registry); +extern void RegisterResizeBicubic(OpRegistryBase *op_registry); +extern void RegisterResizeBilinear(OpRegistryBase *op_registry); +extern void RegisterReverse(OpRegistryBase *op_registry); +extern void RegisterScalarMath(OpRegistryBase *op_registry); +extern void RegisterShape(OpRegistryBase *op_registry); +extern void RegisterSoftmax(OpRegistryBase *op_registry); +extern void RegisterSpaceToBatchND(OpRegistryBase *op_registry); +extern void RegisterSpaceToDepth(OpRegistryBase *op_registry); +extern void RegisterSplit(OpRegistryBase *op_registry); +extern void RegisterSqrDiffMean(OpRegistryBase *op_registry); +extern void RegisterSqueeze(OpRegistryBase *op_registry); +extern void RegisterStack(OpRegistryBase *op_registry); +extern void RegisterStridedSlice(OpRegistryBase *op_registry); +extern void RegisterTranspose(OpRegistryBase *op_registry); +extern void RegisterUnstack(OpRegistryBase *op_registry); +#ifdef MACE_ENABLE_OPENCL +extern void RegisterBufferTransform(OpRegistryBase *op_registry); +extern void RegisterBufferInverseTransform(OpRegistryBase *op_registry); +extern void RegisterLSTMCell(OpRegistryBase *op_registry); +extern void RegisterWinogradInverseTransform(OpRegistryBase *op_registry); +extern void RegisterWinogradTransform(OpRegistryBase *op_registry); + +#endif // MACE_ENABLE_OPENCL +} // namespace kernels + + +OpRegistry::OpRegistry() : OpRegistryBase() { + // Keep in lexicographical order + kernels::RegisterActivation(this); + kernels::RegisterAddN(this); + kernels::RegisterArgMax(this); + kernels::RegisterBatchNorm(this); + kernels::RegisterBatchToSpaceND(this); + kernels::RegisterBiasAdd(this); + kernels::RegisterCast(this); + 
kernels::RegisterChannelShuffle(this); + kernels::RegisterConcat(this); + kernels::RegisterConv2D(this); + kernels::RegisterCrop(this); + kernels::RegisterDeconv2D(this); + kernels::RegisterDepthToSpace(this); + kernels::RegisterDepthwiseConv2d(this); + kernels::RegisterDequantize(this); + kernels::RegisterEltwise(this); + kernels::RegisterExpandDims(this); + kernels::RegisterFill(this); + kernels::RegisterFullyConnected(this); + kernels::RegisterGather(this); + kernels::RegisterIdentity(this); + kernels::RegisterInferConv2dShape(this); + kernels::RegisterLocalResponseNorm(this); + kernels::RegisterMatMul(this); + kernels::RegisterPad(this); + kernels::RegisterPooling(this); + kernels::RegisterQuantize(this); + kernels::RegisterReduceMean(this); + kernels::RegisterReshape(this); + kernels::RegisterResizeBicubic(this); + kernels::RegisterResizeBilinear(this); + kernels::RegisterReverse(this); + kernels::RegisterScalarMath(this); + kernels::RegisterShape(this); + kernels::RegisterSoftmax(this); + kernels::RegisterSpaceToBatchND(this); + kernels::RegisterSpaceToDepth(this); + kernels::RegisterSplit(this); + kernels::RegisterStack(this); + kernels::RegisterStridedSlice(this); + kernels::RegisterSqrDiffMean(this); + kernels::RegisterSqueeze(this); + kernels::RegisterTranspose(this); + kernels::RegisterUnstack(this); +#ifdef MACE_ENABLE_OPENCL + kernels::RegisterBufferTransform(this); + kernels::RegisterBufferInverseTransform(this); + kernels::RegisterLSTMCell(this); + kernels::RegisterWinogradInverseTransform(this); + kernels::RegisterWinogradTransform(this); + +#endif // MACE_ENABLE_OPENCL +} + +} // namespace mace diff --git a/mace/ops/ops_register.h b/mace/kernels/ops_register.h similarity index 76% rename from mace/ops/ops_register.h rename to mace/kernels/ops_register.h index 9369fde5d7a717a8e74a155253f838eecf0e96cb..e3576adb56e2be58bc4615350f111c5bf8e2f891 100644 --- a/mace/ops/ops_register.h +++ b/mace/kernels/ops_register.h @@ -12,19 +12,19 @@ // See the License 
for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPS_REGISTER_H_ -#define MACE_OPS_OPS_REGISTER_H_ +#ifndef MACE_KERNELS_OPS_REGISTER_H_ +#define MACE_KERNELS_OPS_REGISTER_H_ #include "mace/core/operator.h" namespace mace { -class OperatorRegistry : public OperatorRegistryBase { +class OpRegistry : public OpRegistryBase { public: - OperatorRegistry(); - ~OperatorRegistry() = default; + OpRegistry(); + ~OpRegistry() = default; }; } // namespace mace -#endif // MACE_OPS_OPS_REGISTER_H_ +#endif // MACE_KERNELS_OPS_REGISTER_H_ diff --git a/mace/kernels/pad.cc b/mace/kernels/pad.cc new file mode 100644 index 0000000000000000000000000000000000000000..9024eb0f8ed27f3703b45346018e0a940ed63313 --- /dev/null +++ b/mace/kernels/pad.cc @@ -0,0 +1,130 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/pad.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +class PadOp; + +template +class PadOp : public Operation { + public: + explicit PadOp(OpConstructContext *context) + : Operation(context), + paddings_(Operation::GetRepeatedArgs("paddings")), + constant_value_(Operation::GetOptionalArg( + "constant_value", 0.0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK( + this->paddings_.size() == static_cast(input->dim_size()) * 2); + auto input_shape = input->shape(); + MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0] + + this->paddings_[1], + input_shape[1] + this->paddings_[2] + + this->paddings_[3], + input_shape[2] + this->paddings_[4] + + this->paddings_[5], + input_shape[3] + this->paddings_[6] + + this->paddings_[7]})); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_ptr = input->data(); + T *output_ptr = output->mutable_data(); + std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); + + const index_t batch = input->dim(0); + const index_t channel = input->dim(1); + const index_t height = input->dim(2); + const index_t width = input->dim(3); +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channel; ++c) { + for (index_t h = 0; h < height; ++h) { + const index_t in_offset = (((b * channel + c) * height) + h) * width; + const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) + + (c + this->paddings_[2])) * output->dim(2) + + (h + this->paddings_[4])) * output->dim(3) + + this->paddings_[6]; + memcpy(output_ptr + out_offset, + input_ptr + in_offset, + width * sizeof(T)); + } + } + } + + return MaceStatus::MACE_SUCCESS; + } + + 
private: + std::vector paddings_; + float constant_value_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class PadOp : public Operation { + public: + explicit PadOp(OpConstructContext *context) + : Operation(context) { + std::vector paddings = Operation::GetRepeatedArgs("paddings"); + float constant_value = Operation::GetOptionalArg( + "constant_value", 0.0); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::PadKernel(paddings, constant_value)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterPad(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Pad", PadOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Pad", PadOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Pad", PadOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h deleted file mode 100644 index 23d60bf439ba4c6ae4ec0b7da4178ec19250c10f..0000000000000000000000000000000000000000 --- a/mace/kernels/pad.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_PAD_H_ -#define MACE_KERNELS_PAD_H_ - -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -namespace mace { -namespace kernels { - -template -struct PadFunctor : OpKernel { - PadFunctor(OpKernelContext *context, - const std::vector &paddings, - const float constant_value) - : OpKernel(context), - paddings_(paddings), - constant_value_(constant_value) {} - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK( - this->paddings_.size() == static_cast(input->dim_size()) * 2); - auto input_shape = input->shape(); - MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0] - + this->paddings_[1], - input_shape[1] + this->paddings_[2] - + this->paddings_[3], - input_shape[2] + this->paddings_[4] - + this->paddings_[5], - input_shape[3] + this->paddings_[6] - + this->paddings_[7]})); - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - auto input_ptr = input->data(); - T *output_ptr = output->mutable_data(); - std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); - - const index_t batch = input->dim(0); - const index_t channel = input->dim(1); - const index_t height = input->dim(2); - const index_t width = input->dim(3); -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channel; ++c) { - for (index_t h = 0; h < height; ++h) { - const index_t in_offset = (((b * channel + c) * height) + h) * width; - const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) - + (c + this->paddings_[2])) * output->dim(2) - + (h + this->paddings_[4])) * output->dim(3) - + this->paddings_[6]; - memcpy(output_ptr + out_offset, - input_ptr + in_offset, - width * sizeof(T)); - } - } 
- } - - return MACE_SUCCESS; - } - - std::vector paddings_; - float constant_value_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLPadKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPadKernel); -}; -template -struct PadFunctor : OpKernel { - PadFunctor(OpKernelContext *context, - const std::vector &paddings, - const float constant_value); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_PAD_H_ diff --git a/mace/kernels/pooling.cc b/mace/kernels/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..07d41d114214391bfd4090602c1134c46e49b29b --- /dev/null +++ b/mace/kernels/pooling.cc @@ -0,0 +1,467 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include "mace/kernels/pooling.h" + +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/kernels/conv_pool_2d_base.h" +#include "mace/kernels/conv_pool_2d_util.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/pooling.h" +#include "mace/kernels/opencl/buffer/pooling.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +class PoolingOpBase : public ConvPool2dOpBase { + public: + explicit PoolingOpBase(OpConstructContext *context) + : ConvPool2dOpBase(context), + kernels_(Operation::GetRepeatedArgs("kernels")), + pooling_type_( + static_cast(Operation::GetOptionalArg( + "pooling_type", static_cast(AVG)))) {} + + protected: + std::vector kernels_; + PoolingType pooling_type_; + + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + +template +class PoolingOp; + +template <> +class PoolingOp : public PoolingOpBase { + public: + explicit PoolingOp(OpConstructContext *context) + : PoolingOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + std::vector output_shape(4); + std::vector filter_shape = { + input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]}; + + std::vector paddings(2); + if (paddings_.empty()) { + kernels::CalcNCHWPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input_tensor->shape().data(), + filter_shape.data(), + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::CEIL, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input_tensor); + 
Tensor::MappingGuard output_guard(output_tensor); + const float *input = input_tensor->data(); + float *output = output_tensor->mutable_data(); + const index_t *input_shape = input_tensor->shape().data(); + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; + + if (pooling_type_ == PoolingType::MAX) { + MaxPooling(input, + input_shape, + output_shape.data(), + kernels_.data(), + strides_.data(), + dilations_.data(), + pad_hw, + output); + } else if (pooling_type_ == PoolingType::AVG) { + AvgPooling(input, + input_shape, + output_shape.data(), + kernels_.data(), + strides_.data(), + dilations_.data(), + pad_hw, + output); + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + void MaxPooling(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = std::numeric_limits::lowest(); + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * 
stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res = std::max(res, input[input_offset]); + } + } + } + output[out_offset] = res; + } + } + } + } + } + + void AvgPooling(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = 0; + int block_size = 0; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res += input[input_offset]; + ++block_size; + } + } + } + output[out_offset] = res / block_size; + } + } + } + } + } +}; + +template <> +class PoolingOp : public PoolingOpBase { + public: + explicit PoolingOp(OpConstructContext *context) + : PoolingOpBase(context) {} + + MaceStatus 
Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1, + "Quantized pooling does not support dilation > 1 yet."); + // Use the same scale and zero point with input and output. + output_tensor->SetScale(input_tensor->scale()); + output_tensor->SetZeroPoint(input_tensor->zero_point()); + + std::vector output_shape(4); + std::vector filter_shape = { + input_tensor->dim(3), kernels_[0], kernels_[1], input_tensor->dim(3)}; + + std::vector paddings(2); + if (paddings_.empty()) { + CalcPaddingAndOutputSize(input_tensor->shape().data(), + NHWC, + filter_shape.data(), + OHWI, + dilations_.data(), + strides_.data(), + padding_type_, + output_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcOutputSize(input_tensor->shape().data(), + NHWC, + filter_shape.data(), + OHWI, + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::CEIL, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + + const index_t out_channels = output_tensor->dim(3); + const index_t in_channels = input_tensor->dim(3); + MACE_CHECK(out_channels == in_channels); + + Tensor::MappingGuard input_guard(input_tensor); + Tensor::MappingGuard output_guard(output_tensor); + const uint8_t *input = input_tensor->data(); + uint8_t *output = output_tensor->mutable_data(); + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; + + if (pooling_type_ == PoolingType::MAX) { + MaxPooling(input, + input_tensor->shape().data(), + output_shape.data(), + kernels_.data(), + strides_.data(), + pad_hw, + output); + } else if (pooling_type_ == PoolingType::AVG) { + AvgPooling(input, + input_tensor->shape().data(), + output_shape.data(), + kernels_.data(), + strides_.data(), + pad_hw, + output); + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + void MaxPooling(const 
uint8_t *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *pad_hw, + uint8_t *output) { +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + + uint8_t *out_ptr = + output + ((b * out_height + h) * out_width + w) * channels; + std::fill_n(out_ptr, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height + ih) * in_width + iw) * channels; + index_t c = 0; +#if defined(MACE_ENABLE_NEON) + for (; c <= channels - 16; c += 16) { + uint8x16_t out_vec = vld1q_u8(out_ptr + c); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + out_vec = vmaxq_u8(out_vec, in_vec); + vst1q_u8(out_ptr + c, out_vec); + } + for (; c <= channels - 8; c += 8) { + uint8x8_t out_vec = vld1_u8(out_ptr + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + out_vec = vmax_u8(out_vec, in_vec); + vst1_u8(out_ptr + c, out_vec); + } +#endif + for (; c < channels; ++c) { + out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); + } + } + } + } + } + } + } + + void AvgPooling(const uint8_t *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *pad_hw, + uint8_t *output) { 
+#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + const index_t block_size = + (in_h_end - in_h_begin) * (in_w_end - in_w_begin); + MACE_CHECK(block_size > 0); + + std::vector average_buffer(channels); + uint16_t *avg_buffer = average_buffer.data(); + std::fill_n(avg_buffer, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height + ih) * in_width + iw) * channels; + index_t c = 0; +#if defined(MACE_ENABLE_NEON) + for (; c <= channels - 16; c += 16) { + uint16x8_t avg_vec[2]; + avg_vec[0] = vld1q_u16(avg_buffer + c); + avg_vec[1] = vld1q_u16(avg_buffer + c + 8); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); + avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); + vst1q_u16(avg_buffer + c, avg_vec[0]); + vst1q_u16(avg_buffer + c + 8, avg_vec[1]); + } + for (; c <= channels - 8; c += 8) { + uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + avg_vec = vaddw_u8(avg_vec, in_vec); + vst1q_u16(avg_buffer + c, avg_vec); + } +#endif + for (; c < channels; ++c) { + avg_buffer[c] += in_ptr[c]; + } + } + } + uint8_t *out_ptr = + output + ((b * out_height + h) * 
out_width + w) * channels; + for (index_t c = 0; c < channels; ++c) { + out_ptr[c] = static_cast( + (avg_buffer[c] + block_size / 2) / block_size); + } + } + } + } + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class PoolingOp : public PoolingOpBase { + public: + explicit PoolingOp(OpConstructContext *context) + : PoolingOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::PoolingKernel); + } else { + kernel_.reset(new opencl::buffer::PoolingKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + return kernel_->Compute(context, input, pooling_type_, kernels_.data(), + strides_.data(), padding_type_, paddings_, + dilations_.data(), output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterPooling(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 000fa269bcf7c97f3a3ac6f38abac25ce3330015..9780907c818aefc4f7e43936a2b6d5aceaf73a69 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -15,476 +15,12 @@ #ifndef MACE_KERNELS_POOLING_H_ #define MACE_KERNELS_POOLING_H_ -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/conv_pool_2d_util.h" -#include "mace/kernels/kernel.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif namespace mace { - enum PoolingType { AVG = 1, // avg_pool MAX = 2, // max_pool }; - 
-namespace kernels { - -struct PoolingFunctorBase : OpKernel { - PoolingFunctorBase(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : OpKernel(context), - pooling_type_(pooling_type), - kernels_(kernels), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - dilations_(dilations) {} - - const PoolingType pooling_type_; - const int *kernels_; - const int *strides_; - const Padding padding_type_; - std::vector paddings_; - const int *dilations_; -}; - -template -struct PoolingFunctor; - -template <> -struct PoolingFunctor: PoolingFunctorBase { - PoolingFunctor(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) {} - - void MaxPooling(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < 
out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res = std::max(res, input[input_offset]); - } - } - } - output[out_offset] = res; - } - } - } - } - } - - void AvgPooling(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = 0; - int block_size = 0; - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * 
in_width + inw; - res += input[input_offset]; - ++block_size; - } - } - } - output[out_offset] = res / block_size; - } - } - } - } - } - - MaceStatus operator()(const Tensor *input_tensor, // NCHW - Tensor *output_tensor, // NCHW - StatsFuture *future) { - MACE_UNUSED(future); - std::vector output_shape(4); - std::vector filter_shape = { - input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]}; - - std::vector paddings(2); - if (paddings_.empty()) { - kernels::CalcNCHWPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), dilations_, - strides_, padding_type_, output_shape.data(), paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input_tensor->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_, - strides_, - RoundType::CEIL, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - - Tensor::MappingGuard input_guard(input_tensor); - Tensor::MappingGuard output_guard(output_tensor); - const float *input = input_tensor->data(); - float *output = output_tensor->mutable_data(); - const index_t *input_shape = input_tensor->shape().data(); - int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; - - if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, - input_shape, - output_shape.data(), - kernels_, - strides_, - dilations_, - pad_hw, - output); - } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, - input_shape, - output_shape.data(), - kernels_, - strides_, - dilations_, - pad_hw, - output); - } else { - MACE_NOT_IMPLEMENTED; - } - - return MACE_SUCCESS; - } -}; - -template <> -struct PoolingFunctor: PoolingFunctorBase { - PoolingFunctor(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) {} - 
- void MaxPooling(const uint8_t *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *pad_hw, - uint8_t *output) { -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - std::fill_n(out_ptr, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; -#if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint8x16_t out_vec = vld1q_u8(out_ptr + c); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - out_vec = vmaxq_u8(out_vec, in_vec); - vst1q_u8(out_ptr + c, out_vec); - } - for (; c <= channels - 8; c += 8) { - uint8x8_t out_vec = vld1_u8(out_ptr + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - out_vec = vmax_u8(out_vec, in_vec); - vst1_u8(out_ptr + c, out_vec); - } -#endif - for (; c < channels; ++c) { - out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); - } - } - } - } - } - } - } - - void AvgPooling(const uint8_t *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *pad_hw, - 
uint8_t *output) { -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - const index_t block_size = - (in_h_end - in_h_begin) * (in_w_end - in_w_begin); - MACE_CHECK(block_size > 0); - - std::vector average_buffer(channels); - uint16_t *avg_buffer = average_buffer.data(); - std::fill_n(avg_buffer, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; -#if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint16x8_t avg_vec[2]; - avg_vec[0] = vld1q_u16(avg_buffer + c); - avg_vec[1] = vld1q_u16(avg_buffer + c + 8); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); - avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); - vst1q_u16(avg_buffer + c, avg_vec[0]); - vst1q_u16(avg_buffer + c + 8, avg_vec[1]); - } - for (; c <= channels - 8; c += 8) { - uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - avg_vec = vaddw_u8(avg_vec, in_vec); - vst1q_u16(avg_buffer + c, avg_vec); - } -#endif - for (; c < channels; ++c) { - avg_buffer[c] += in_ptr[c]; - } - } - } - uint8_t *out_ptr = - output + ((b * 
out_height + h) * out_width + w) * channels; - for (index_t c = 0; c < channels; ++c) { - out_ptr[c] = static_cast( - (avg_buffer[c] + block_size / 2) / block_size); - } - } - } - } - } - - MaceStatus operator()(const Tensor *input_tensor, // NHWC - Tensor *output_tensor, // NHWC - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1, - "Quantized pooling does not support dilation > 1 yet."); - // Use the same scale and zero point with input and output. - output_tensor->SetScale(input_tensor->scale()); - output_tensor->SetZeroPoint(input_tensor->zero_point()); - - std::vector output_shape(4); - std::vector filter_shape = { - input_tensor->dim(3), kernels_[0], kernels_[1], input_tensor->dim(3)}; - - std::vector paddings(2); - if (paddings_.empty()) { - CalcPaddingAndOutputSize(input_tensor->shape().data(), - NHWC, - filter_shape.data(), - OHWI, - dilations_, - strides_, - padding_type_, - output_shape.data(), - paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input_tensor->shape().data(), - NHWC, - filter_shape.data(), - OHWI, - paddings_.data(), - dilations_, - strides_, - RoundType::CEIL, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - - const index_t out_channels = output_tensor->dim(3); - const index_t in_channels = input_tensor->dim(3); - MACE_CHECK(out_channels == in_channels); - - Tensor::MappingGuard input_guard(input_tensor); - Tensor::MappingGuard output_guard(output_tensor); - const uint8_t *input = input_tensor->data(); - uint8_t *output = output_tensor->mutable_data(); - int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; - - if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, - input_tensor->shape().data(), - output_shape.data(), - kernels_, - strides_, - pad_hw, - output); - } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, - input_tensor->shape().data(), - output_shape.data(), - kernels_, - strides_, - pad_hw, - 
output); - } else { - MACE_NOT_IMPLEMENTED; - } - - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLPoolingKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPoolingKernel); -}; -template -struct PoolingFunctor : PoolingFunctorBase { - PoolingFunctor(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations); - - MaceStatus operator()(const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels } // namespace mace #endif // MACE_KERNELS_POOLING_H_ diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h deleted file mode 100644 index aa002988a53f3145f945b145432da2d21ae34f01..0000000000000000000000000000000000000000 --- a/mace/kernels/proposal.h +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_PROPOSAL_H_ -#define MACE_KERNELS_PROPOSAL_H_ - -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" - -namespace mace { -namespace kernels { - -inline std::vector WHCenters(const std::vector &anchor) { - // width, height, width_center, height_center - std::vector window(4); - window[0] = anchor[2] - anchor[0] + 1; - window[1] = anchor[3] - anchor[1] + 1; - window[2] = anchor[0] + (window[0] - 1) / 2; - window[3] = anchor[1] + (window[1] - 1) / 2; - return window; -} - -inline std::vector> GenerateAnchors( - const std::vector &scales, - const std::vector &ratios, - const int base_size) { - const std::vector base_anchor = - {0, 0, - static_cast(base_size-1), - static_cast(base_size-1)}; - - const size_t scales_size = scales.size(); - const size_t ratios_size = ratios.size(); - // get height, width, centers - std::vector base_window = WHCenters(base_anchor); - const float size = base_window[0] * base_window[1]; - std::vector> anchors(scales_size * ratios_size, - std::vector(4)); - -#pragma omp parallel for - for (size_t ratio_idx = 0; ratio_idx < ratios_size; ++ratio_idx) { - float ws = ::roundf(::sqrtf(size / ratios[ratio_idx])); - float hs = ::roundf(ws * ratios[ratio_idx]); - std::vector tmp_anchor(4); - tmp_anchor[0] = base_window[2] - (ws - 1) / 2; - tmp_anchor[1] = base_window[3] - (hs - 1) / 2; - tmp_anchor[2] = base_window[2] + (ws - 1) / 2; - tmp_anchor[3] = base_window[3] + (hs - 1) / 2; - auto window = WHCenters(tmp_anchor); - for (size_t scale_idx = 0; scale_idx < scales_size; ++scale_idx) { - const size_t idx = ratio_idx * scales_size + scale_idx; - ws = window[0] * scales[scale_idx]; - hs = window[1] * scales[scale_idx]; - anchors[idx][0] = window[2] - (ws - 1) / 2; - anchors[idx][1] = window[3] - (hs - 1) / 2; - anchors[idx][2] = window[2] + (ws - 1) / 2; - anchors[idx][3] = window[3] + (hs - 1) / 2; - } - } - return 
anchors; -} - -inline std::vector nms(const float *bboxes_ptr, - const index_t num_bboxes, - const float thresh, - const int post_nms_top_n) { - std::vector keep; - std::vector suppressed(num_bboxes, 0); - - std::vector areas(num_bboxes, 0); - for (index_t i = 0; i < num_bboxes; ++i) { - const index_t idx = (i << 2); - areas[i] = (bboxes_ptr[idx + 2] - bboxes_ptr[idx] + 1) * - (bboxes_ptr[idx + 3] - bboxes_ptr[idx + 1] + 1); - } - - for (int i = 0; i < num_bboxes; ++i) { - if (suppressed[i] == 1) continue; - keep.push_back(i); - if (keep.size() >= static_cast(post_nms_top_n)) break; - int coord_idx = i << 2; - const float x1 = bboxes_ptr[coord_idx]; - const float y1 = bboxes_ptr[coord_idx + 1]; - const float x2 = bboxes_ptr[coord_idx + 2]; - const float y2 = bboxes_ptr[coord_idx + 3]; - const float area1 = areas[i]; - for (int j = i + 1; j < num_bboxes; ++j) { - if (suppressed[j] == 1) continue; - - coord_idx = j << 2; - const float iou = - std::max(0.0, - std::min(x2, bboxes_ptr[coord_idx + 2]) - - std::max(x1, bboxes_ptr[coord_idx]) + 1) - * std::max(0.0, - std::min(y2, bboxes_ptr[coord_idx + 3]) - - std::max(y1, bboxes_ptr[coord_idx + 1]) + 1); - if ((iou / (area1 + areas[j] - iou)) >= thresh) { - suppressed[j] = 1; - } - } - } - return keep; -} - - -template -struct ProposalFunctor : OpKernel { - ProposalFunctor(OpKernelContext *context, - const int min_size, - const float nms_thresh, - const int pre_nms_top_n, - const int post_nms_top_n, - const int feat_stride, - const int base_size, - const std::vector &scales, - const std::vector &ratios) : - OpKernel(context), - min_size_(min_size), - thresh_(nms_thresh), - pre_nms_top_n_(pre_nms_top_n), - post_nms_top_n_(post_nms_top_n), - feat_stride_(feat_stride), - anchors_(GenerateAnchors(scales, ratios, base_size)) {} - - MaceStatus operator()(const Tensor *rpn_cls_prob, - const Tensor *rpn_bbox_pred, - const Tensor *img_info_tensor, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - 
MACE_CHECK(rpn_cls_prob->dim(1) == rpn_bbox_pred->dim(1) && - rpn_cls_prob->dim(2) == rpn_bbox_pred->dim(2)); - MACE_CHECK((rpn_cls_prob->dim(3) / 2 == rpn_bbox_pred->dim(3) / 4) && - (static_cast(rpn_cls_prob->dim(3) / 2) == anchors_.size())); - const float *img_info = img_info_tensor->data(); - const int im_height = static_cast(img_info[0] - 1); - const int im_width = static_cast(img_info[1] - 1); - const index_t feat_height = rpn_cls_prob->dim(1); - const index_t feat_width = rpn_cls_prob->dim(2); - const int anchors_size = anchors_.size(); - - // shift anchors to original input - std::vector> proposals( - anchors_size * feat_height * feat_width, - std::vector(4)); - -#pragma omp parallel for collapse(3) - for (int h_idx = 0; h_idx < feat_height; ++h_idx) { - for (int w_idx = 0; w_idx < feat_width; ++w_idx) { - for (int a_idx = 0; a_idx < anchors_size; ++a_idx) { - const int shift_h = h_idx * feat_stride_; - const int shift_w = w_idx * feat_stride_; - const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size - + a_idx; - proposals[sanc_idx][0] = anchors_[a_idx][0] + shift_w; - proposals[sanc_idx][1] = anchors_[a_idx][1] + shift_h; - proposals[sanc_idx][2] = anchors_[a_idx][2] + shift_w; - proposals[sanc_idx][3] = anchors_[a_idx][3] + shift_h; - } - } - } - // Convert anchors into proposals via bbox transformations - // 2. 
clip predicted boxes to image - const float *bbox_deltas = rpn_bbox_pred->data(); -#pragma omp parallel for collapse(3) - for (int h_idx = 0; h_idx < feat_height; ++h_idx) { - for (int w_idx = 0; w_idx < feat_width; ++w_idx) { - for (int a_idx = 0; a_idx < anchors_size; ++a_idx) { - const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size - + a_idx; - const float width = proposals[sanc_idx][2] - - proposals[sanc_idx][0] + 1; - const float height = proposals[sanc_idx][3] - - proposals[sanc_idx][1] + 1; - int delta_offset = sanc_idx * 4; - float pred_ctr_x = bbox_deltas[delta_offset + 0] * width + - (proposals[sanc_idx][0] + width / 2); - float pred_ctr_y = bbox_deltas[delta_offset + 1] * height + - (proposals[sanc_idx][1] + height / 2); - float pred_w = std::exp(bbox_deltas[delta_offset + 2]) * width; - float pred_h = std::exp(bbox_deltas[delta_offset + 3]) * height; - - proposals[sanc_idx][0] = std::max( - std::min(pred_ctr_x - pred_w / 2, im_width), - 0); - proposals[sanc_idx][1] = std::max( - std::min(pred_ctr_y - pred_h / 2, im_height), - 0); - proposals[sanc_idx][2] = std::max( - std::min(pred_ctr_x + pred_w / 2, im_width), - 0); - proposals[sanc_idx][3] = std::max( - std::min(pred_ctr_y + pred_h / 2, im_height), - 0); - } - } - } - // 3. remove predicted boxes with either height or width < threshold - // (NOTE: convert min_size to input image scale stored in im_info[2]) - std::vector keep; - const float min_size = min_size_ * img_info[2]; - for (int h_idx = 0; h_idx < feat_height; ++h_idx) { - for (int w_idx = 0; w_idx < feat_width; ++w_idx) { - for (int a_idx = 0; a_idx < anchors_size; ++a_idx) { - const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size - + a_idx; - const float width = proposals[sanc_idx][2] - - proposals[sanc_idx][0] + 1; - const float height = proposals[sanc_idx][3] - - proposals[sanc_idx][1] + 1; - if (width >= min_size && height >= min_size) { - keep.push_back(sanc_idx); - } - } - } - } - - // 4. 
sort all (proposal, score) pairs by score from highest to lowest - // 5. take top pre_nms_topN (e.g. 6000) - auto scores = rpn_cls_prob->data(); - const int scores_chan = static_cast(rpn_cls_prob->dim(3)); - - auto score_idx_func = [&](int idx) -> int { - return (idx / anchors_size) * scores_chan + - (idx % anchors_size) + anchors_size; - }; - std::sort(keep.begin(), keep.end(), [&](int left, int right) -> bool{ - return scores[score_idx_func(left)] > - scores[score_idx_func(right)]; - }); - - int size = std::min(pre_nms_top_n_, keep.size()); - std::vector nms_scores(size, 0); - std::vector nms_proposals((size << 2), 0); -#pragma omp parallel for - for (int i = 0; i < size; ++i) { - nms_scores[i] = scores[score_idx_func(keep[i])]; - nms_proposals[i << 2] = proposals[keep[i]][0]; - nms_proposals[(i << 2) + 1] = proposals[keep[i]][1]; - nms_proposals[(i << 2) + 2] = proposals[keep[i]][2]; - nms_proposals[(i << 2) + 3] = proposals[keep[i]][3]; - } - - /* 6. apply nms (e.g. threshold = 0.7) - 7. take after_nms_topN (e.g. 300) - 8. 
return the top proposals (-> RoIs top) */ - auto nms_result = nms(nms_proposals.data(), - nms_scores.size(), - thresh_, - post_nms_top_n_); - - // Output rois blob - // Our RPN implementation only supports a single input image, so all - // batch inds are 0 - size = static_cast(nms_result.size()); - MACE_RETURN_IF_ERROR(output->Resize({size, 1, 1, 5})); - auto output_ptr = output->mutable_data(); -#pragma omp parallel for - for (int i = 0; i < size; ++i) { - const int out_idx = i * 5; - const int nms_idx = nms_result[i] * 4; - output_ptr[out_idx] = 0; - output_ptr[out_idx + 1] = nms_proposals[nms_idx]; - output_ptr[out_idx + 2] = nms_proposals[nms_idx + 1]; - output_ptr[out_idx + 3] = nms_proposals[nms_idx + 2]; - output_ptr[out_idx + 4] = nms_proposals[nms_idx + 3]; - } - - return MACE_SUCCESS; - } - - const int min_size_; - const float thresh_; - const int pre_nms_top_n_; - const int post_nms_top_n_; - const int feat_stride_; - std::vector> anchors_; -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_PROPOSAL_H_ diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.cc similarity index 58% rename from mace/kernels/quantize.h rename to mace/kernels/quantize.cc index 337a831618a407de3fb443e67159347196b18e33..2f2b8fc263f9f5ddeaf5aeb394f81283dddbc245 100644 --- a/mace/kernels/quantize.h +++ b/mace/kernels/quantize.cc @@ -12,34 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_QUANTIZE_H_ -#define MACE_KERNELS_QUANTIZE_H_ - #include #include #include #include -#include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" #include "mace/utils/quantize.h" namespace mace { namespace kernels { -template -struct QuantizeFunctor; +template +class QuantizeOp; -template<> -struct QuantizeFunctor : OpKernel { - explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {} +template <> +class QuantizeOp : public Operation { + public: + explicit QuantizeOp(OpConstructContext *context) + : Operation(context), + non_zero_( + static_cast(Operation::GetOptionalArg("non_zero", 0))) {} - MaceStatus operator()(const Tensor *input, - const bool non_zero, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); const float *input_data = input->data(); @@ -55,29 +55,34 @@ struct QuantizeFunctor : OpKernel { int32_t zero_point; Quantize(input_data, input->size(), - non_zero, + non_zero_, output_data, &scale, &zero_point); output->SetScale(scale); output->SetZeroPoint(zero_point); } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + bool non_zero_; }; -template -struct DequantizeFunctor; +template +class DequantizeOp; -template<> -struct DequantizeFunctor : OpKernel { - explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {} +template <> +class DequantizeOp : public Operation { + public: + explicit DequantizeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); + MaceStatus Run(OpContext *context) override { + 
MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); const uint8_t *input_data = input->data(); @@ -87,12 +92,18 @@ struct DequantizeFunctor : OpKernel { input->scale(), input->zero_point(), output_data); - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; +void RegisterQuantize(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Quantize", QuantizeOp, + DeviceType::CPU, uint8_t); +} + +void RegisterDequantize(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp, + DeviceType::CPU, uint8_t); +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_QUANTIZE_H_ diff --git a/mace/kernels/reduce_mean.h b/mace/kernels/reduce_mean.cc similarity index 74% rename from mace/kernels/reduce_mean.h rename to mace/kernels/reduce_mean.cc index db00fd41e0c8c33a32a8389bec29c61bdea124db..d103125bb71a24729caba0139c661bf43659fdcf 100644 --- a/mace/kernels/reduce_mean.h +++ b/mace/kernels/reduce_mean.cc @@ -12,29 +12,66 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_REDUCE_MEAN_H_ -#define MACE_KERNELS_REDUCE_MEAN_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif #include #include #include #include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/reduce_mean.h" +#endif // MACE_ENABLE_OPENCL + namespace mace { namespace kernels { -template -struct ReduceMeanFunctor : OpKernel { - ReduceMeanFunctor(OpKernelContext *context, - const std::vector &axis, - const bool keep_dims) - : OpKernel(context), axis_(axis), keep_dims_(keep_dims) {} +class ReduceMeanOpBase : public Operation { + public: + explicit ReduceMeanOpBase(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetRepeatedArgs("axis")), + keep_dims_(Operation::GetOptionalArg("keepdims", false)) { + } + + protected: + inline void Validate() { + const Tensor *input = this->Input(0); + const int left = static_cast(input->dim_size() * -1); + const int right = static_cast(input->dim_size()); + if (axis_.size()) { + for (unsigned int i = 0; i < axis_.size(); ++i) { + MACE_CHECK(axis_[i] > left && axis_[i] < right, "Axis is over range."); + } + } + } + + protected: + const std::vector axis_; + bool keep_dims_; +}; + +template +class ReduceMeanOp; + +template +class ReduceMeanOp : public ReduceMeanOpBase { + public: + explicit ReduceMeanOp(OpConstructContext *context) + : ReduceMeanOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Validate(); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + Simplify(input); + output->Resize(out_shape_); + Compute(input, output); + return MaceStatus::MACE_SUCCESS; + } + private: void Simplify(const Tensor *input) { std::vector bitmap(static_cast(input->dim_size()), false); if (axis_.size() == 0) { @@ -190,48 +227,49 @@ struct ReduceMeanFunctor : OpKernel { } } - MaceStatus 
operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - Simplify(input); - output->Resize(out_shape_); - Compute(input, output); - return MACE_SUCCESS; - } - - const std::vector axis_; - bool keep_dims_; + private: bool reduce_first_axis_; std::vector data_reshape_; std::vector out_shape_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLReduceMeanKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLReduceMeanKernel); -}; template -struct ReduceMeanFunctor : OpKernel { - ReduceMeanFunctor(OpKernelContext *context, - const std::vector &axis, - const bool keep_dims); +class ReduceMeanOp : public ReduceMeanOpBase { + public: + explicit ReduceMeanOp(OpConstructContext *context) + : ReduceMeanOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ReduceMeanKernel(axis_, keep_dims_)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; -#endif +#endif // MACE_ENABLE_OPENCL + +void RegisterReduceMean(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ReduceMean", ReduceMeanOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ReduceMean", ReduceMeanOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ReduceMean", ReduceMeanOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_REDUCE_MEAN_H_ diff --git a/mace/ops/reshape.h b/mace/kernels/reshape.cc similarity index 68% rename from mace/ops/reshape.h 
rename to mace/kernels/reshape.cc index 86476de06bb5cb65e55bc623218fb7f97f1e3819..2cfef42b9857269ca6306253d115d3f3e564cbba 100644 --- a/mace/ops/reshape.h +++ b/mace/kernels/reshape.cc @@ -12,24 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_RESHAPE_H_ -#define MACE_OPS_RESHAPE_H_ - #include #include "mace/core/operator.h" -#include "mace/kernels/reshape.h" namespace mace { -namespace ops { +namespace kernels { -template -class ReshapeOp : public Operator { +template +class ReshapeOp : public Operation { public: - ReshapeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), functor_(context) {} + explicit ReshapeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); const Tensor *shape = this->Input(SHAPE); const index_t num_dims = shape->dim_size() == 0 ? 
0 : shape->dim(0); @@ -63,19 +60,29 @@ class ReshapeOp : public Operator { } Tensor *output = this->Output(OUTPUT); + output->ReuseTensorBuffer(*input); + output->Reshape(out_shape); - return functor_(input, out_shape, output, future); + return MaceStatus::MACE_SUCCESS; } - private: - kernels::ReshapeFunctor functor_; - private: MACE_OP_INPUT_TAGS(INPUT, SHAPE); MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterReshape(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_RESHAPE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/resize_bicubic.cc b/mace/kernels/resize_bicubic.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe0512ffe2e9bb331c194ee6cf9e3378db54665f --- /dev/null +++ b/mace/kernels/resize_bicubic.cc @@ -0,0 +1,234 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/kernels/resize_bicubic.h" + +#include +#include +#include + +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/resize_bicubic.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +inline const std::shared_ptr InitCoeffsTable() { + // Allocate and initialize coefficients table using Bicubic + // convolution algorithm. + // https://en.wikipedia.org/wiki/Bicubic_interpolation + auto coeffs_tab = std::shared_ptr( + new float[(resize_bicubic::kTableSize + 1) * 2], + std::default_delete()); + float *coeffs_tab_ptr = coeffs_tab.get(); + static const float A = -0.75f; + for (int i = 0; i <= resize_bicubic::kTableSize; ++i) { + float x = i * 1.0f / resize_bicubic::kTableSize; + coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; + x += 1.0; + coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; + } + return coeffs_tab; +} + +inline const float *GetCoeffsTable() { + // Static so that we initialize it on first use + static const std::shared_ptr coeffs_tab = InitCoeffsTable(); + return coeffs_tab.get(); +} + +inline int64_t Bound(int64_t val, int64_t limit) { + return std::min(limit - 1ll, std::max(0ll, val)); +} + +inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, + std::vector *weights, + std::vector *indices) { + auto in_loc = static_cast(scale * out_loc); + const float delta = scale * out_loc - in_loc; + const int64_t offset = lrintf(delta * resize_bicubic::kTableSize); + const float *coeffs_tab = GetCoeffsTable(); + *weights = {coeffs_tab[offset * 2 + 1], + coeffs_tab[offset * 2], + coeffs_tab[(resize_bicubic::kTableSize - offset) * 2], + coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]}; + *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), + Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; +} + +inline float Interpolate1D(const std::vector &weights, + const std::vector &values) { + return values[0] * 
weights[0] + values[1] * weights[1] + + values[2] * weights[2] + values[3] * weights[3]; +} + +inline void ResizeImage(const float *images, + const index_t batch_size, + const index_t in_height, + const index_t in_width, + const index_t out_height, + const index_t out_width, + const index_t channels, + const float height_scale, + const float width_scale, + float *output) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch_size; ++b) { + for (index_t y = 0; y < out_height; ++y) { + std::vector y_weights; + std::vector y_indices; + GetWeightsAndIndices(height_scale, y, in_height, &y_weights, + &y_indices); + for (index_t x = 0; x < out_width; ++x) { + std::vector x_weights; + std::vector x_indices; + GetWeightsAndIndices(width_scale, x, in_width, &x_weights, + &x_indices); + + for (index_t c = 0; c < channels; ++c) { + // Use a 4x4 patch to compute the interpolated output value at + // (b, y, x, c). + const float *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + float *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + std::vector coeff(4, 0.0); + for (index_t i = 0; i < 4; ++i) { + const std::vector values = { + channel_input_ptr[y_indices[i] * in_width + x_indices[0]], + channel_input_ptr[y_indices[i] * in_width + x_indices[1]], + channel_input_ptr[y_indices[i] * in_width + x_indices[2]], + channel_input_ptr[y_indices[i] * in_width + x_indices[3]]}; + coeff[i] = Interpolate1D(x_weights, values); + } + channel_output_ptr[y * out_width + x] = + Interpolate1D(y_weights, coeff); + } + } + } + } +} + +template +class ResizeBicubicOp; + +template <> +class ResizeBicubicOp : public Operation { + public: + explicit ResizeBicubicOp(OpConstructContext *context) + : Operation(context), + align_corners_(Operation::GetOptionalArg("align_corners", false)), + size_(Operation::GetRepeatedArgs("size", {-1, -1})) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + 
MACE_CHECK(size_.size() == 2); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t in_height = input->dim(2); + const index_t in_width = input->dim(3); + + index_t out_height = size_[0]; + index_t out_width = size_[1]; + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector out_shape{batch, channels, out_height, out_width}; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + if (out_height == in_height && out_width == in_width) { + std::copy(input_data, + input_data + batch * channels * in_height * in_width, + output_data); + return MaceStatus::MACE_SUCCESS; + } + + float height_scale = + resize_bicubic::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + resize_bicubic::CalculateResizeScale(in_width, + out_width, + align_corners_); + + ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, + channels, height_scale, width_scale, output_data); + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool align_corners_; + std::vector size_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class ResizeBicubicOp : public Operation { + public: + explicit ResizeBicubicOp(OpConstructContext *context) + : Operation(context) { + bool align_corners = Operation::GetOptionalArg( + "align_corners", false); + std::vector size = Operation::GetRepeatedArgs( + "size", {-1, -1}); + MACE_CHECK(size.size() == 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, + size[0], + size[1])); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext 
*context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterResizeBicubic(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index a33e0549fe73035bc866af5384609f72781c297b..5e02edd413b2b25b0f4eeb73cd3f76cfa9052343 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -15,68 +15,12 @@ #ifndef MACE_KERNELS_RESIZE_BICUBIC_H_ #define MACE_KERNELS_RESIZE_BICUBIC_H_ -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/logging.h" +#include "mace/core/types.h" namespace mace { namespace kernels { - -static const int64_t kTableSize = (1 << 10); - -inline const std::shared_ptr InitCoeffsTable() { - // Allocate and initialize coefficients table using Bicubic - // convolution algorithm. 
- // https://en.wikipedia.org/wiki/Bicubic_interpolation - auto coeffs_tab = std::shared_ptr(new float[(kTableSize + 1) * 2], - std::default_delete()); - float *coeffs_tab_ptr = coeffs_tab.get(); - static const double A = -0.75; - for (int i = 0; i <= kTableSize; ++i) { - float x = i * 1.0 / kTableSize; - coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; - x += 1.0; - coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; - } - return coeffs_tab; -} - -inline const float *GetCoeffsTable() { - // Static so that we initialize it on first use - static const std::shared_ptr coeffs_tab = InitCoeffsTable(); - return coeffs_tab.get(); -} - -inline int64_t Bound(int64_t val, int64_t limit) { - return std::min(limit - 1ll, std::max(0ll, val)); -} - -inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, - std::vector *weights, - std::vector *indices) { - const int64_t in_loc = scale * out_loc; - const float delta = scale * out_loc - in_loc; - const int64_t offset = lrintf(delta * kTableSize); - const float *coeffs_tab = GetCoeffsTable(); - *weights = {coeffs_tab[offset * 2 + 1], - coeffs_tab[offset * 2], - coeffs_tab[(kTableSize - offset) * 2], - coeffs_tab[(kTableSize - offset) * 2 + 1]}; - *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), - Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; -} - -inline float Interpolate1D(const std::vector &weights, - const std::vector &values) { - return values[0] * weights[0] + values[1] * weights[1] + - values[2] * weights[2] + values[3] * weights[3]; -} +namespace resize_bicubic { +constexpr int64_t kTableSize = (1u << 10); inline float CalculateResizeScale(index_t in_size, index_t out_size, @@ -85,140 +29,7 @@ inline float CalculateResizeScale(index_t in_size, ? 
(in_size - 1) / static_cast(out_size - 1) : in_size / static_cast(out_size); } - -inline void ResizeImage(const float *images, - const index_t batch_size, - const index_t in_height, - const index_t in_width, - const index_t out_height, - const index_t out_width, - const index_t channels, - const float height_scale, - const float width_scale, - float *output) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t y = 0; y < out_height; ++y) { - std::vector y_weights; - std::vector y_indices; - GetWeightsAndIndices(height_scale, y, in_height, &y_weights, - &y_indices); - for (index_t x = 0; x < out_width; ++x) { - std::vector x_weights; - std::vector x_indices; - GetWeightsAndIndices(width_scale, x, in_width, &x_weights, - &x_indices); - - for (index_t c = 0; c < channels; ++c) { - // Use a 4x4 patch to compute the interpolated output value at - // (b, y, x, c). - const float *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - float *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - std::vector coeff(4, 0.0); - for (index_t i = 0; i < 4; ++i) { - const std::vector values = { - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[0]]), - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[1]]), - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[2]]), - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[3]])}; - coeff[i] = Interpolate1D(x_weights, values); - } - channel_output_ptr[y * out_width + x] = - Interpolate1D(y_weights, coeff); - } - } - } - } -} - -template -struct ResizeBicubicFunctor; - -template<> -struct ResizeBicubicFunctor : OpKernel { - ResizeBicubicFunctor(OpKernelContext *context, - const bool align_corners, - const std::vector &size) - : OpKernel(context), - align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - 
MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t in_height = input->dim(2); - const index_t in_width = input->dim(3); - - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape{batch, channels, out_height, out_width}; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard output_mapper(output); - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - - if (out_height == in_height && out_width == in_width) { - std::copy(input_data, - input_data + batch * channels * in_height * in_width, - output_data); - return MACE_SUCCESS; - } - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, - channels, height_scale, width_scale, output_data); - - return MACE_SUCCESS; - } - - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLResizeBicubicKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBicubicKernel); -}; -template -struct ResizeBicubicFunctor - : OpKernel { - ResizeBicubicFunctor(OpKernelContext *context, - bool align_corners, - const std::vector &size); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL +} // namespace resize_bicubic } // namespace kernels } // namespace mace diff --git a/mace/kernels/resize_bilinear.cc 
b/mace/kernels/resize_bilinear.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ea86158fc574bcce5e98e30763ce51e3afef0fd --- /dev/null +++ b/mace/kernels/resize_bilinear.cc @@ -0,0 +1,371 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/resize_bilinear.h" + +#include +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/resize_bilinear.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +struct CachedInterpolation { + index_t lower; // Lower source index used in the interpolation + index_t upper; // Upper source index used in the interpolation + // 1-D linear iterpolation scale (see: + // https://en.wikipedia.org/wiki/Bilinear_interpolation) + float lerp; +}; + +inline void ComputeInterpolationWeights( + const index_t out_size, + const index_t in_size, + const float scale, + CachedInterpolation *interpolation) { + interpolation[out_size].lower = 0; + interpolation[out_size].upper = 0; + for (index_t i = out_size - 1; i >= 0; --i) { + const float in = i * scale; + interpolation[i].lower = static_cast(in); + interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); + interpolation[i].lerp = in - interpolation[i].lower; + } +} + +template +inline T ComputeLerp(const T top_left, + const T top_right, + const T 
bottom_left, + const T bottom_right, + const float x_lerp, + const float y_lerp); + +template <> +inline float ComputeLerp(const float top_left, + const float top_right, + const float bottom_left, + const float bottom_right, + const float x_lerp, + const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return top + (bottom - top) * y_lerp; +} + +template <> +inline uint8_t ComputeLerp(const uint8_t top_left, + const uint8_t top_right, + const uint8_t bottom_left, + const uint8_t bottom_right, + const float x_lerp, + const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return Saturate(roundf(top + (bottom - top) * y_lerp)); +} + +template +inline void ResizeImageNCHW(const T *images, + const index_t batch_size, + const index_t in_height, + const index_t in_width, + const index_t out_height, + const index_t out_width, + const index_t channels, + const std::vector &xs_vec, + const std::vector &ys, + T *output) { + const CachedInterpolation *xs = xs_vec.data(); + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch_size; ++b) { + for (index_t c = 0; c < channels; ++c) { + const T + *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + T *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + for (index_t y = 0; y < out_height; ++y) { + const T *y_lower_input_ptr = + channel_input_ptr + ys[y].lower * in_width; + const T *y_upper_input_ptr = + channel_input_ptr + ys[y].upper * in_width; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T top_left = y_lower_input_ptr[xs[x].lower]; + const T top_right = y_lower_input_ptr[xs[x].upper]; + const T bottom_left = y_upper_input_ptr[xs[x].lower]; + const T bottom_right = 
y_upper_input_ptr[xs[x].upper]; + channel_output_ptr[y * out_width + x] = + ComputeLerp(top_left, top_right, bottom_left, + bottom_right, xs_lerp, ys_lerp); + } + } + } + } +} + +template +inline void ResizeImageNHWC(const T *images, + const index_t batch_size, + const index_t in_height, + const index_t in_width, + const index_t out_height, + const index_t out_width, + const index_t channels, + const std::vector &xs_vec, + const std::vector &ys, + T *output) { + const CachedInterpolation *xs = xs_vec.data(); + + for (index_t b = 0; b < batch_size; ++b) { + const T *input_base = images + b * channels * in_height * in_width; + T *output_base = output + b * channels * out_height * out_width; +#pragma omp parallel for + for (index_t y = 0; y < out_height; ++y) { + const T + *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; + const T + *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T *top_left = y_lower_input_ptr + xs[x].lower * channels; + const T *top_right = y_lower_input_ptr + xs[x].upper * channels; + const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; + const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; + + T *output_ptr = output_base + (y * out_width + x) * channels; + for (index_t c = 0; c < channels; ++c) { + output_ptr[c] = + ComputeLerp(top_left[c], top_right[c], bottom_left[c], + bottom_right[c], xs_lerp, ys_lerp); + } + } + } + } +} + +template +class ResizeBilinearOp; + +template +class ResizeBilinearOp : public Operation { + public: + explicit ResizeBilinearOp(OpConstructContext *context) + : Operation(context), + align_corners_(Operation::GetOptionalArg("align_corners", false)), + size_(Operation::GetRepeatedArgs("size", {-1, -1})) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(size_.size() == 2); + const Tensor 
*input = this->Input(0); + Tensor *output = this->Output(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t in_height = input->dim(2); + const index_t in_width = input->dim(3); + + index_t out_height = size_[0]; + index_t out_width = size_[1]; + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector out_shape{batch, channels, out_height, out_width}; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + + if (out_height == in_height && out_width == in_width) { + std::copy(input_data, + input_data + batch * channels * in_height * in_width, + output_data); + return MaceStatus::MACE_SUCCESS; + } + + float height_scale = + resize_bilinear::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + resize_bilinear::CalculateResizeScale(in_width, + out_width, + align_corners_); + + std::vector ys(out_height + 1); + std::vector xs(out_width + 1); + + // Compute the cached interpolation weights on the x and y dimensions. 
+ ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); + ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); + + ResizeImageNCHW(input_data, + batch, + in_height, + in_width, + out_height, + out_width, + channels, + xs, + ys, + output_data); + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool align_corners_; + std::vector size_; +}; + +template <> +class ResizeBilinearOp : public Operation { + public: + explicit ResizeBilinearOp(OpConstructContext *context) + : Operation(context), + align_corners_(Operation::GetOptionalArg("align_corners", false)), + size_(Operation::GetRepeatedArgs("size", {-1, -1})) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(size_.size() == 2); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + index_t out_height = size_[0]; + index_t out_width = size_[1]; + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector out_shape{batch, out_height, out_width, channels}; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const uint8_t *input_data = input->data(); + uint8_t *output_data = output->mutable_data(); + + if (out_height == in_height && out_width == in_width) { + std::copy(input_data, + input_data + batch * in_height * in_width * channels , + output_data); + return MaceStatus::MACE_SUCCESS; + } + + float height_scale = + resize_bilinear::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + resize_bilinear::CalculateResizeScale(in_width, + out_width, + align_corners_); + + std::vector ys(out_height + 1); + std::vector xs(out_width + 
1); + + // Compute the cached interpolation weights on the x and y dimensions. + ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); + ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); + + ResizeImageNHWC(input_data, + batch, + in_height, + in_width, + out_height, + out_width, + channels, + xs, + ys, + output_data); + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool align_corners_; + std::vector size_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class ResizeBilinearOp : public Operation { + public: + explicit ResizeBilinearOp(OpConstructContext *context) + : Operation(context) { + bool align_corners = Operation::GetOptionalArg( + "align_corners", false); + std::vector size = Operation::GetRepeatedArgs( + "size", {-1, -1}); + MACE_CHECK(size.size() == 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, + size[0], + size[1])); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterResizeBilinear(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/resize_bilinear.h 
b/mace/kernels/resize_bilinear.h index ea3f7aa35b9f3848f93802f71c245898991a9b55..1f94e50007f8ead8c6816f4b58aaeed08d9f3bce 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -15,26 +15,11 @@ #ifndef MACE_KERNELS_RESIZE_BILINEAR_H_ #define MACE_KERNELS_RESIZE_BILINEAR_H_ -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/quantize.h" +#include "mace/core/types.h" namespace mace { namespace kernels { - -struct CachedInterpolation { - index_t lower; // Lower source index used in the interpolation - index_t upper; // Upper source index used in the interpolation - // 1-D linear iterpolation scale (see: - // https://en.wikipedia.org/wiki/Bilinear_interpolation) - float lerp; -}; - +namespace resize_bilinear { inline float CalculateResizeScale(index_t in_size, index_t out_size, bool align_corners) { @@ -42,302 +27,7 @@ inline float CalculateResizeScale(index_t in_size, ? 
(in_size - 1) / static_cast(out_size - 1) : in_size / static_cast(out_size); } - -inline void ComputeInterpolationWeights( - const index_t out_size, - const index_t in_size, - const float scale, - CachedInterpolation *interpolation) { - interpolation[out_size].lower = 0; - interpolation[out_size].upper = 0; - for (index_t i = out_size - 1; i >= 0; --i) { - const float in = i * scale; - interpolation[i].lower = static_cast(in); - interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); - interpolation[i].lerp = in - interpolation[i].lower; - } -} - -template -inline T ComputeLerp(const T top_left, - const T top_right, - const T bottom_left, - const T bottom_right, - const float x_lerp, - const float y_lerp); - -template <> -inline float ComputeLerp(const float top_left, - const float top_right, - const float bottom_left, - const float bottom_right, - const float x_lerp, - const float y_lerp) { - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; - return top + (bottom - top) * y_lerp; -} - -template <> -inline uint8_t ComputeLerp(const uint8_t top_left, - const uint8_t top_right, - const uint8_t bottom_left, - const uint8_t bottom_right, - const float x_lerp, - const float y_lerp) { - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; - return Saturate(roundf(top + (bottom - top) * y_lerp)); -} - -template -inline void ResizeImageNCHW(const T *images, - const index_t batch_size, - const index_t in_height, - const index_t in_width, - const index_t out_height, - const index_t out_width, - const index_t channels, - const std::vector &xs_vec, - const std::vector &ys, - T *output) { - const CachedInterpolation *xs = xs_vec.data(); - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t c = 0; c < channels; ++c) { - const T - 
*channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - T *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - for (index_t y = 0; y < out_height; ++y) { - const T *y_lower_input_ptr = - channel_input_ptr + ys[y].lower * in_width; - const T *y_upper_input_ptr = - channel_input_ptr + ys[y].upper * in_width; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = xs[x].lerp; - const T top_left = y_lower_input_ptr[xs[x].lower]; - const T top_right = y_lower_input_ptr[xs[x].upper]; - const T bottom_left = y_upper_input_ptr[xs[x].lower]; - const T bottom_right = y_upper_input_ptr[xs[x].upper]; - channel_output_ptr[y * out_width + x] = - ComputeLerp(top_left, top_right, bottom_left, - bottom_right, xs_lerp, ys_lerp); - } - } - } - } -} - -template -inline void ResizeImageNHWC(const T *images, - const index_t batch_size, - const index_t in_height, - const index_t in_width, - const index_t out_height, - const index_t out_width, - const index_t channels, - const std::vector &xs_vec, - const std::vector &ys, - T *output) { - const CachedInterpolation *xs = xs_vec.data(); - - for (index_t b = 0; b < batch_size; ++b) { - const T *input_base = images + b * channels * in_height * in_width; - T *output_base = output + b * channels * out_height * out_width; -#pragma omp parallel for - for (index_t y = 0; y < out_height; ++y) { - const T - *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; - const T - *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = xs[x].lerp; - const T *top_left = y_lower_input_ptr + xs[x].lower * channels; - const T *top_right = y_lower_input_ptr + xs[x].upper * channels; - const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; - const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; - - T 
*output_ptr = output_base + (y * out_width + x) * channels; - for (index_t c = 0; c < channels; ++c) { - output_ptr[c] = - ComputeLerp(top_left[c], top_right[c], bottom_left[c], - bottom_right[c], xs_lerp, ys_lerp); - } - } - } - } -} - -template -struct ResizeBilinearFunctor : OpKernel { - ResizeBilinearFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : OpKernel(context), align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t in_height = input->dim(2); - const index_t in_width = input->dim(3); - - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape{batch, channels, out_height, out_width}; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard output_mapper(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - - if (out_height == in_height && out_width == in_width) { - std::copy(input_data, - input_data + batch * channels * in_height * in_width, - output_data); - return MACE_SUCCESS; - } - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - std::vector ys(out_height + 1); - std::vector xs(out_width + 1); - - // Compute the cached interpolation weights on the x and y dimensions. 
- ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); - ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - - ResizeImageNCHW(input_data, - batch, - in_height, - in_width, - out_height, - out_width, - channels, - xs, - ys, - output_data); - - return MACE_SUCCESS; - } - - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - -template -struct ResizeBilinearFunctor : OpKernel { - ResizeBilinearFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : OpKernel(context), align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape{batch, out_height, out_width, channels}; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard output_mapper(output); - const uint8_t *input_data = input->data(); - uint8_t *output_data = output->mutable_data(); - - if (out_height == in_height && out_width == in_width) { - std::copy(input_data, - input_data + batch * in_height * in_width * channels , - output_data); - return MACE_SUCCESS; - } - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - std::vector ys(out_height + 1); - std::vector xs(out_width + 1); - - // Compute the cached interpolation weights on the x and y dimensions. 
- ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); - ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - - ResizeImageNHWC(input_data, - batch, - in_height, - in_width, - out_height, - out_width, - channels, - xs, - ys, - output_data); - - return MACE_SUCCESS; - } - - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLResizeBilinearKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBilinearKernel); -}; -template -struct ResizeBilinearFunctor - : OpKernel { - ResizeBilinearFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - +} // namespace resize_bilinear } // namespace kernels } // namespace mace diff --git a/mace/kernels/reverse.h b/mace/kernels/reverse.cc similarity index 73% rename from mace/kernels/reverse.h rename to mace/kernels/reverse.cc index 69d5fd6def191ad45ea7f897cf0c87a519379af3..f73db418adcc8fa4b2a001c3b24ffd9c5d70c3f5 100644 --- a/mace/kernels/reverse.h +++ b/mace/kernels/reverse.cc @@ -12,38 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_REVERSE_H_ -#define MACE_KERNELS_REVERSE_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct ReverseFunctor; +class ReverseOp; template -struct ReverseFunctor : OpKernel { - explicit ReverseFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - const Tensor *axis, - Tensor *output, - StatsFuture *future) { +class ReverseOp : public Operation { + public: + explicit ReverseOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *axis = this->Input(AXIS); + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(axis->dim_size() == 1, "Only support reverse in one axis now"); const int32_t *axis_data = axis->data(); const index_t reverse_dim = *axis_data >= 0 ? 
- *axis_data : *axis_data + input->dim_size(); + *axis_data : *axis_data + input->dim_size(); MACE_CHECK(reverse_dim >= 0 && reverse_dim < input->dim_size(), "axis must be in the range [-rank(input), rank(input))"); @@ -71,13 +64,18 @@ struct ReverseFunctor : OpKernel { input_idx += low_dim_elem_size; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + MACE_OP_INPUT_TAGS(INPUT, AXIS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterReverse(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_REVERSE_H_ diff --git a/mace/kernels/scalar_math.h b/mace/kernels/scalar_math.cc similarity index 72% rename from mace/kernels/scalar_math.h rename to mace/kernels/scalar_math.cc index 928a4954b0b203ea2c338d07713ae411ea91dd17..f9f4822a01aaa234c08fafb7ec16cda9bcc43029 100644 --- a/mace/kernels/scalar_math.h +++ b/mace/kernels/scalar_math.cc @@ -12,15 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_SCALAR_MATH_H_ -#define MACE_KERNELS_SCALAR_MATH_H_ - #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" #include "mace/kernels/eltwise.h" namespace mace { @@ -89,23 +84,22 @@ void ScalarEltwise(const T* in0, template -struct ScalarMathFunctor : OpKernel { - ScalarMathFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index) {} +class ScalarMathOp : public Operation { + public: + explicit ScalarMathOp(OpConstructContext *context) + : Operation(context), + type_(static_cast(Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE)))), + coeff_(Operation::GetRepeatedArgs("coeff")), + scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), + scalar_input_index_(Operation::GetOptionalArg( + "scalar_input_index", 1)) {} - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future) { - const Tensor* input0 = inputs[0]; - const Tensor* input1 = (inputs.size() >= 2) ? inputs[1] : nullptr; + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Tensor *output = this->Output(0); + const Tensor* input0 = inputs_[0]; + const Tensor* input1 = (inputs_.size() >= 2) ? 
inputs_[1] : nullptr; MACE_CHECK(input0->dim_size() <= 1 && input0->size() == 1, "not support input dim size") << input0->dim_size(); Tensor::MappingGuard in0_guard(input0); @@ -143,18 +137,28 @@ struct ScalarMathFunctor : OpKernel { swapped, out); } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: EltwiseType type_; std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; }; +void RegisterScalarMath(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::GPU, int32_t); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SCALAR_MATH_H_ diff --git a/mace/ops/shape.h b/mace/kernels/shape.cc similarity index 70% rename from mace/ops/shape.h rename to mace/kernels/shape.cc index abb9ffb3197bf53c46881e53bc01c3f4c072bae3..1775f0a0dd7f63ae865743f0a347638a30685460 100644 --- a/mace/ops/shape.h +++ b/mace/kernels/shape.cc @@ -12,23 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_SHAPE_H_ -#define MACE_OPS_SHAPE_H_ - -#include - #include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { template -class ShapeOp : public Operator { +class ShapeOp : public Operation { public: - ShapeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} + explicit ShapeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); if (input->dim_size() > 0) { @@ -40,7 +36,7 @@ class ShapeOp : public Operator { int32_t *output_data = output->mutable_data(); const int data_format = - OperatorBase::GetOptionalArg("data_format", 0); + Operation::GetOptionalArg("data_format", 0); if (input->dim_size() == 4 && D == DeviceType::CPU && data_format == DataFormat::NCHW) { @@ -54,9 +50,8 @@ class ShapeOp : public Operator { output_data[i] = static_cast(input->dim(i)); } } - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } private: @@ -64,7 +59,16 @@ class ShapeOp : public Operator { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterShape(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, + DeviceType::CPU, float); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_SHAPE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.cc similarity index 86% rename from mace/kernels/softmax.h rename to mace/kernels/softmax.cc index 6afca75ac4ec758e2e8cf88309e9f99e9d95698c..1ac3ab4d989f6116112904a72d28a6e7a255fb43 100644 --- a/mace/kernels/softmax.h +++ 
b/mace/kernels/softmax.cc @@ -12,37 +12,36 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SOFTMAX_H_ -#define MACE_KERNELS_SOFTMAX_H_ - #include -#include +#include #include #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/core/operator.h" #include "mace/kernels/fixpoint.h" #include "mace/kernels/gemmlowp_util.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/softmax.h" +#include "mace/kernels/opencl/buffer/softmax.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct SoftmaxFunctor; +template +class SoftmaxOp; -template<> -struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SoftmaxOp : public Operation { + public: + explicit SoftmaxOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); const float *input_data = input->data(); @@ -116,21 +115,24 @@ struct SoftmaxFunctor : OpKernel { } else { MACE_NOT_IMPLEMENTED; } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; -template<> -struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); 
+template <> +class SoftmaxOp : public Operation { + public: + explicit SoftmaxOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); // Ignore range stat, fix range to [0, 1]. For large depth, each softmax // output may be too small (<<1), which causes precision issue. But it is // fine when doing classification inference. @@ -186,7 +188,7 @@ struct SoftmaxFunctor : OpKernel { output_ptr[d] = static_cast(output_f * 255); } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } int32_t scale_q = static_cast(std::min( @@ -346,33 +348,51 @@ struct SoftmaxFunctor : OpKernel { } } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; #ifdef MACE_ENABLE_OPENCL -class OpenCLSoftmaxKernel { +template +class SoftmaxOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *logits, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSoftmaxKernel); -}; -template -struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context); - MaceStatus operator()(const Tensor *logits, - Tensor *output, - StatsFuture *future); + explicit SoftmaxOp(OpConstructContext *context) + : Operation(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SoftmaxKernel); + } else { + kernel_.reset(new opencl::buffer::SoftmaxKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterSoftmax(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, 
"Softmax", SoftmaxOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SOFTMAX_H_ diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.cc similarity index 79% rename from mace/kernels/space_to_batch.h rename to mace/kernels/space_to_batch.cc index 337baefcca9d1ea3ff231e3652e467473b89578f..41c731c50d6f3a62cb09cabd730686175a1d0cbf 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.cc @@ -12,33 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_ -#define MACE_KERNELS_SPACE_TO_BATCH_H_ - -#include -#include #include +#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/space_to_batch.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct SpaceToBatchFunctorBase : OpKernel { - SpaceToBatchFunctorBase(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : OpKernel(context), - paddings_(paddings.begin(), paddings.end()), - block_shape_(block_shape.begin(), block_shape.end()) { +class SpaceToBatchOpBase : public Operation { + public: + explicit SpaceToBatchOpBase(OpConstructContext *context) + : Operation(context), + paddings_(Operation::GetRepeatedArgs("paddings", {0, 0, 0, 0})), + block_shape_(Operation::GetRepeatedArgs("block_shape", {1, 1})) { MACE_CHECK( - block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1, + block_shape_.size() == 2 && 
block_shape_[0] > 1 && block_shape_[1] > 1, "Block's shape should be 1D, and greater than 1"); - MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D"); + MACE_CHECK(paddings_.size() == 4, "Paddings' shape should be 2D"); } + protected: std::vector paddings_; std::vector block_shape_; @@ -88,21 +85,19 @@ struct SpaceToBatchFunctorBase : OpKernel { } }; -template -struct SpaceToBatchFunctor; - -template<> -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) {} +template +class SpaceToBatchNDOp; - MaceStatus operator()(const Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SpaceToBatchNDOp : public SpaceToBatchOpBase { + public: + explicit SpaceToBatchNDOp(OpConstructContext *context) + : SpaceToBatchOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *space_tensor = this->Input(0); + Tensor *batch_tensor = this->Output(0); std::vector output_shape(4, 0); CalculateSpaceToBatchOutputShape(space_tensor, @@ -197,22 +192,21 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { } // b } // block_h } // c - return MACE_SUCCESS; + + return MaceStatus::MACE_SUCCESS; } }; -template<> -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) {} - - MaceStatus operator()(const Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SpaceToBatchNDOp : public SpaceToBatchOpBase { + public: + explicit SpaceToBatchNDOp(OpConstructContext *context) + : SpaceToBatchOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + 
const Tensor *space_tensor = this->Input(0); + Tensor *batch_tensor = this->Output(0); std::vector output_shape(4, 0); CalculateSpaceToBatchOutputShape(space_tensor, @@ -302,38 +296,52 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { * sizeof(uint8_t)); } // b - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; #ifdef MACE_ENABLE_OPENCL -class OpenCLSpaceToBatchKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *space_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *batch_tensor, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToBatchKernel); -}; template -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape); - - MaceStatus operator()(const Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future); +class SpaceToBatchNDOp : public SpaceToBatchOpBase { + public: + explicit SpaceToBatchNDOp(OpConstructContext *context) + : SpaceToBatchOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SpaceToBatchKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *space_tensor = this->Input(0); + Tensor *batch_tensor = this->Output(0); + std::vector output_shape(4, 0); + CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, + output_shape.data()); + return kernel_->Compute(context, space_tensor, paddings_, block_shape_, + output_shape, batch_tensor); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL +void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::CPU, uint8_t); + 
+#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SPACE_TO_BATCH_H_ diff --git a/mace/kernels/space_to_depth.h b/mace/kernels/space_to_depth.cc similarity index 60% rename from mace/kernels/space_to_depth.h rename to mace/kernels/space_to_depth.cc index 2f379bbf8bf34cf59a62d57826bca92b42ffe4bb..e2e302e69543e5c06f7736d06ee19f1e2418f60d 100644 --- a/mace/kernels/space_to_depth.h +++ b/mace/kernels/space_to_depth.cc @@ -12,28 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SPACE_TO_DEPTH_H_ -#define MACE_KERNELS_SPACE_TO_DEPTH_H_ #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/space_to_depth.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct SpaceToDepthOpFunctor : OpKernel { - SpaceToDepthOpFunctor(OpKernelContext *context, - const int block_size) - : OpKernel(context), block_size_(block_size) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class SpaceToDepthOp : public Operation { + public: + explicit SpaceToDepthOp(OpConstructContext *context) + : Operation(context), + block_size_(Operation::GetOptionalArg("block_size", 1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); const index_t batch_size = input->dim(0); const index_t input_depth = 
input->dim(1); const index_t input_height = input->dim(2); @@ -79,36 +80,50 @@ struct SpaceToDepthOpFunctor : OpKernel { } } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: const int block_size_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLSpaceToDepthKernel { +template +class SpaceToDepthOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToDepthKernel); -}; -template -struct SpaceToDepthOpFunctor : OpKernel { - explicit SpaceToDepthOpFunctor(OpKernelContext *context, - const int block_size); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + explicit SpaceToDepthOp(OpConstructContext *context) + : Operation(context) { + int block_size = Operation::GetOptionalArg("block_size", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SpaceToDepthKernel(block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL +void RegisterSpaceToDepth(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SPACE_TO_DEPTH_H_ diff --git a/mace/kernels/split.h b/mace/kernels/split.cc similarity index 55% rename from mace/kernels/split.h rename to 
mace/kernels/split.cc index ffef9699ccbeb0fd0404d951e0e6de85c0aeb0d4..68f5f274c9498dd81898ad66a24ca348406b2720 100644 --- a/mace/kernels/split.h +++ b/mace/kernels/split.cc @@ -12,31 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SPLIT_H_ -#define MACE_KERNELS_SPLIT_H_ - -#include #include -#include +#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/split.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct SplitFunctor : OpKernel { - SplitFunctor(OpKernelContext *context, const int32_t axis) - : OpKernel(context), axis_(axis) {} +template +class SplitOp; - MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { - MACE_UNUSED(future); +template +class SplitOp : public Operation { + public: + explicit SplitOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 3)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(this->OutputSize() >= 2) + << "There must be at least two outputs for slicing"; + const Tensor *input = this->Input(0); + const std::vector output_list = this->Outputs(); + MACE_CHECK((input->dim(axis_) % this->OutputSize()) == 0) + << "Outputs do not split input equally."; const index_t input_channels = input->dim(axis_); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; @@ -74,35 +78,56 @@ struct SplitFunctor : OpKernel { input_idx += output_channels * inner_size; } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int32_t axis_; }; + #ifdef MACE_ENABLE_OPENCL -class OpenCLSplitKernel { +template +class SplitOp 
: public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const std::vector &output_list, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSplitKernel); -}; -template -struct SplitFunctor : OpKernel { - SplitFunctor(OpKernelContext *context, const int32_t axis); + explicit SplitOp(OpConstructContext *context) + : Operation(context) { + int32_t axis = Operation::GetOptionalArg("axis", 3); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SplitKernel(axis)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + MACE_CHECK(this->OutputSize() >= 2) + << "There must be at least two outputs for slicing"; + const Tensor *input = this->Input(0); + const std::vector output_list = this->Outputs(); + int32_t axis = Operation::GetOptionalArg("axis", 3); + MACE_CHECK((input->dim(axis) % this->OutputSize()) == 0) + << "Outputs do not split input equally."; + return kernel_->Compute(context, input, output_list); + } - MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future); + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterSplit(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Split", SplitOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Split", SplitOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Split", SplitOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SPLIT_H_ diff --git a/mace/kernels/sqrdiff_mean.h b/mace/kernels/sqrdiff_mean.cc similarity index 58% rename from mace/kernels/sqrdiff_mean.h rename to mace/kernels/sqrdiff_mean.cc index 1c2d009cef979e9859b9c080eaae63deefd87d32..e9c7bde0a975ae3c4becf754faf126722498bf7e 100644 --- a/mace/kernels/sqrdiff_mean.h +++ 
b/mace/kernels/sqrdiff_mean.cc @@ -12,28 +12,45 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SQRDIFF_MEAN_H_ -#define MACE_KERNELS_SQRDIFF_MEAN_H_ - -#include #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif +#include "mace/kernels/opencl/image/sqrdiff_mean.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { template -struct SqrDiffMeanFunctor : OpKernel { - explicit SqrDiffMeanFunctor(OpKernelContext *context) - : OpKernel(context) {} +class SqrDiffMeanOp : public Operation { + public: + explicit SqrDiffMeanOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->Input(1); + Tensor *output = this->Output(0); + + MACE_CHECK(input0->dim(0) == input1->dim(0) && + input0->dim(1) == input1->dim(1), + "inputs dims N and C should be the same."); + std::vector out_shape(4); + out_shape[0] = input0->dim(0); + out_shape[1] = input0->dim(1); + out_shape[2] = 1; + out_shape[3] = 1; + + output->Resize(out_shape); + Compute(input0, input1, output); + return MaceStatus::MACE_SUCCESS; + } + + private: void Compute(const Tensor *input0, const Tensor *input1, Tensor *output) { @@ -56,54 +73,46 @@ struct SqrDiffMeanFunctor : OpKernel { output_ptr[i] /= img_size; } } - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - MACE_CHECK(input0->dim(0) == input1->dim(0) && - input0->dim(1) == input1->dim(1), - "inputs dims N and C should be the same."); - - std::vector out_shape(4); - out_shape[0] = input0->dim(0); - out_shape[1] = input0->dim(1); - out_shape[2] = 1; - 
out_shape[3] = 1; - - output->Resize(out_shape); - Compute(input0, input1, output); - return MACE_SUCCESS; - } }; + #ifdef MACE_ENABLE_OPENCL -class OpenCLSqrDiffMeanKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSqrDiffMeanKernel); -}; template -struct SqrDiffMeanFunctor : OpKernel { - explicit SqrDiffMeanFunctor(OpKernelContext *context); - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future); +class SqrDiffMeanOp : public Operation { + public: + explicit SqrDiffMeanOp(OpConstructContext *context) + : Operation(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SqrDiffMeanKernel()); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->Input(1); + Tensor *output = this->Output(0); + return kernel_->Compute(context, input0, input1, output); + } + private: std::unique_ptr kernel_; }; -#endif +#endif // MACE_ENABLE_OPENCL + + +void RegisterSqrDiffMean(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SQRDIFF_MEAN_H_ diff --git a/mace/ops/squeeze.h b/mace/kernels/squeeze.cc similarity index 57% rename from mace/ops/squeeze.h rename to mace/kernels/squeeze.cc index 7febfb0e20b377c54493623910c64f18228da487..8221bccbedaac60db398af3476dd78eb0b500a98 100644 --- a/mace/ops/squeeze.h +++ b/mace/kernels/squeeze.cc @@ 
-12,27 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_SQUEEZE_H_ -#define MACE_OPS_SQUEEZE_H_ - -#include #include +#include #include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { -template -class SqueezeOp : public Operator { +template +class SqueezeOp : public Operation { public: - SqueezeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - axis_(OperatorBase::GetRepeatedArgs("axis", {})) {} + explicit SqueezeOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetRepeatedArgs("axis", {})) {} - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); std::vector output_shape; std::unordered_set axis_set(axis_.begin(), axis_.end()); @@ -45,19 +43,21 @@ class SqueezeOp : public Operator { output->ReuseTensorBuffer(*input); output->Reshape(output_shape); - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } private: std::vector axis_; - - private: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterSqueeze(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_SQUEEZE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/stack.h b/mace/kernels/stack.cc similarity index 68% rename from 
mace/kernels/stack.h rename to mace/kernels/stack.cc index 4d465784ed18e73ccb1084c4666e89786002c6ce..b3fc8bea56eaae32336f94bdec2fecf1388b5d39 100644 --- a/mace/kernels/stack.h +++ b/mace/kernels/stack.cc @@ -12,34 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_STACK_H_ -#define MACE_KERNELS_STACK_H_ - #include -#include -#include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct StackFunctor : OpKernel { - StackFunctor(OpKernelContext *context, int axis) - : OpKernel(context), axis_(axis) {} +class StackOp : public Operation { + public: + explicit StackOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)) {} - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future) { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const std::vector &inputs = this->Inputs(); + Tensor *output = this->Output(0); MACE_CHECK(!inputs.empty(), "stack inputs are empty."); std::vector input_shape = inputs[0]->shape(); MACE_CHECK(axis_ >= -(inputs[0]->dim_size() + 1) && - axis_ < inputs[0]->dim_size() + 1, + axis_ < inputs[0]->dim_size() + 1, "axis out of bound."); if (axis_ < 0) { axis_ += inputs[0]->dim_size() + 1; @@ -48,14 +43,14 @@ struct StackFunctor : OpKernel { output_shape.insert(output_shape.begin() + axis_, inputs.size()); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - // Some inputs may be in gpu memory, so add mapping here. + // Some inputs_ may be in gpu memory, so add mapping here. 
std::vector mappers; for (size_t i = 0; i < inputs.size(); ++i) { mappers.emplace_back(Tensor::MappingGuard(inputs[i])); } // Output is on host, no need to map data - T *output_data = output->mutable_data(); + auto *output_data = output->mutable_data(); std::vector input_data(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { input_data[i] = inputs[i]->data(); @@ -74,15 +69,21 @@ struct StackFunctor : OpKernel { output_data += low_dim_elem_size; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int axis_; }; +void RegisterStack(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, int32_t); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_STACK_H_ diff --git a/mace/kernels/strided_slice.h b/mace/kernels/strided_slice.cc similarity index 77% rename from mace/kernels/strided_slice.h rename to mace/kernels/strided_slice.cc index 3f1f2c49b1d888b470a9027278558e71da6800a6..b030661bd800b6fda3bfb72baa66f159431012de 100644 --- a/mace/kernels/strided_slice.h +++ b/mace/kernels/strided_slice.cc @@ -12,49 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_STRIDED_SLICE_H_ -#define MACE_KERNELS_STRIDED_SLICE_H_ - #include -#include -#include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct StridedSliceFunctor : OpKernel { - StridedSliceFunctor(OpKernelContext *context, - int begin_mask, - int end_mask, - int ellipsis_mask, - int new_axis_mask, - int shrink_axis_mask, - bool is_slice) - : OpKernel(context), - begin_mask_(begin_mask), - end_mask_(end_mask), - ellipsis_mask_(ellipsis_mask), - new_axis_mask_(new_axis_mask), - shrink_axis_mask_(shrink_axis_mask), - is_slice_(is_slice), - tmp_strides_tensor_(context->device()->allocator(), - DataTypeToEnum::v()) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *begin_indices, - const Tensor *end_indices, - const Tensor *strides, - Tensor *output, - StatsFuture *future) { +class StridedSliceOp : public Operation { + public: + explicit StridedSliceOp(OpConstructContext *context) + : Operation(context), + begin_mask_(Operation::GetOptionalArg("begin_mask", 0)), + end_mask_(Operation::GetOptionalArg("end_mask", 0)), + ellipsis_mask_(Operation::GetOptionalArg("ellipsis_mask", 0)), + new_axis_mask_(Operation::GetOptionalArg("new_axis_mask", 0)), + shrink_axis_mask_( + Operation::GetOptionalArg("shrink_axis_mask", 0)), + is_slice_(Operation::GetOptionalArg("slice", false)) { MACE_CHECK(ellipsis_mask_ == 0 && new_axis_mask_ == 0, "ellipsis_mask and new_axis_mask are not supported yet."); + } + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *begin_indices = this->Input(BEGIN); + const Tensor *end_indices = this->Input(END); + const Tensor *strides = nullptr; + if (this->InputSize() > 3) { + strides = this->Input(STRIDES); + } + Tensor *output = this->Output(OUTPUT); if (strides == nullptr) { 
tmp_strides_tensor_.Resize({begin_indices->size()}); Tensor::MappingGuard strides_guard(&tmp_strides_tensor_); @@ -118,16 +109,16 @@ struct StridedSliceFunctor : OpKernel { } else { real_end_indices[d] = end_indices_data[d] < -dim_len - ? -1 - : (end_indices_data[d] < 0 - ? (end_indices_data[d] + dim_len) - : std::min(static_cast(end_indices_data[d]), - dim_len)); + ? -1 + : (end_indices_data[d] < 0 + ? (end_indices_data[d] + dim_len) + : std::min(static_cast(end_indices_data[d]), + dim_len)); } int32_t out_dim_len = std::max( 0.f, std::ceil((real_end_indices[d] - real_begin_indices[d]) / - static_cast(strides_data[d]))); + static_cast(strides_data[d]))); if (!(shrink_axis_mask_ & (1 << d))) { output_shape.push_back(out_dim_len); } else { @@ -197,7 +188,7 @@ struct StridedSliceFunctor : OpKernel { : k > real_end_indices[2]; k += strides_data[2]) { *output_data++ = - input_data[(i * input->dim(1) + j) * input->dim(2) + k]; + input_data[(i * input->dim(1) + j) * input->dim(2) + k]; } } } @@ -205,11 +196,10 @@ struct StridedSliceFunctor : OpKernel { MACE_NOT_IMPLEMENTED; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int begin_mask_; int end_mask_; int ellipsis_mask_; @@ -217,9 +207,23 @@ struct StridedSliceFunctor : OpKernel { int shrink_axis_mask_; bool is_slice_; Tensor tmp_strides_tensor_; + + MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterStridedSlice(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::GPU, int32_t); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - 
-#endif // MACE_KERNELS_STRIDED_SLICE_H_ diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.cc similarity index 88% rename from mace/kernels/transpose.h rename to mace/kernels/transpose.cc index 04e1caed91d59a82c99315c2034ed1519be0ffc1..2ec3801542cf63e5841988995efef01a6350fd7b 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_TRANSPOSE_H_ -#define MACE_KERNELS_TRANSPOSE_H_ - #if defined(MACE_ENABLE_NEON) #include #endif @@ -22,10 +19,7 @@ #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { @@ -104,19 +98,29 @@ static void TransposeNCHWToNHWCC2(const float *input, } } -template -struct TransposeFunctor : OpKernel { - TransposeFunctor(OpKernelContext *context, const std::vector &dims) - : OpKernel(context), dims_(dims) {} +template +class TransposeOp : public Operation { + public: + explicit TransposeOp(OpConstructContext *context) + : Operation(context), + dims_(Operation::GetRepeatedArgs("dims")) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + const std::vector &input_shape = input->shape(); + MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) || + (input_shape.size() == 2 && dims_.size() == 2), + "rank should be 2 or 4"); + std::vector output_shape; + for (size_t i = 0; i < dims_.size(); ++i) { + output_shape.push_back(input_shape[dims_[i]]); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); - const std::vector 
&input_shape = input->shape(); - const std::vector &output_shape = output->shape(); const T *input_data = input->data(); T *output_data = output->mutable_data(); @@ -216,13 +220,17 @@ struct TransposeFunctor : OpKernel { MACE_NOT_IMPLEMENTED; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: std::vector dims_; }; +void RegisterTranspose(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_TRANSPOSE_H_ diff --git a/mace/kernels/unstack.h b/mace/kernels/unstack.cc similarity index 76% rename from mace/kernels/unstack.h rename to mace/kernels/unstack.cc index b193c6b5a96455bf670983eb08e505790ad6afee..8403b8f65ec51478dcb6fb628c0b0c61f43e8e9d 100644 --- a/mace/kernels/unstack.h +++ b/mace/kernels/unstack.cc @@ -12,30 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_UNSTACK_H_ -#define MACE_KERNELS_UNSTACK_H_ - #include -#include -#include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct UnstackFunctor : OpKernel { - UnstackFunctor(OpKernelContext *context, int axis) - : OpKernel(context), axis_(axis) {} +class UnstackOp : public Operation { + public: + explicit UnstackOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)) {} - MaceStatus operator()(const Tensor *input, - const std::vector &outputs, - StatsFuture *future) { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + const std::vector outputs = this->Outputs(); std::vector input_shape = input->shape(); MACE_CHECK(axis_ >= -(input->dim_size()) && axis_ < input->dim_size(), "axis out of 
bound."); @@ -71,15 +66,19 @@ struct UnstackFunctor : OpKernel { input_idx += low_dim_elem_size; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int axis_; }; +void RegisterUnstack(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, + DeviceType::CPU, int32_t); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_UNSTACK_H_ diff --git a/mace/kernels/winograd_transform.cc b/mace/kernels/winograd_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..286bff95b38df794b46d5e3a588a8cc1aba401db --- /dev/null +++ b/mace/kernels/winograd_transform.cc @@ -0,0 +1,102 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/opencl/image/winograd_transform.h" + +namespace mace { +namespace kernels { + +template +class WinogradTransformOp; + +template +class WinogradTransformOp : public Operation { + public: + explicit WinogradTransformOp(OpConstructContext *context) + : Operation(context) { + Padding padding_type = static_cast(Operation::GetOptionalArg( + "padding", static_cast(VALID))); + std::vector paddings = Operation::GetRepeatedArgs( + "padding_values"); + int block_size = Operation::GetOptionalArg("wino_block_size", 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::WinogradTransformKernel( + padding_type, paddings, block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + return kernel_->Compute(context, input_tensor, output_tensor); + } + + private: + std::unique_ptr kernel_; +}; + +template +class WinogradInverseTransformOp; + +template +class WinogradInverseTransformOp : public Operation { + public: + explicit WinogradInverseTransformOp(OpConstructContext *context) + : Operation(context) { + ActivationType activation = kernels::StringToActivationType( + Operation::GetOptionalArg("activation", "NOOP")); + float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + int block_size = Operation::GetOptionalArg("wino_block_size", 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::WinogradInverseTransformKernel( + activation, relux_max_limit, block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { + Tensor *output_tensor = this->Output(0); + return kernel_->Compute(context, inputs_, output_tensor); + } + + private: + 
std::unique_ptr kernel_; +}; + +void RegisterWinogradTransform(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "WinogradTransform", + WinogradTransformOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "WinogradTransform", + WinogradTransformOp, DeviceType::GPU, half); +} + +void RegisterWinogradInverseTransform( + OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", + WinogradInverseTransformOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", + WinogradInverseTransformOp, DeviceType::GPU, half); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h deleted file mode 100644 index 313645980160fa6a34e8144eb1a14d4491e244f2..0000000000000000000000000000000000000000 --- a/mace/kernels/winograd_transform.h +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_ -#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/conv_pool_2d_util.h" - -namespace mace { -namespace kernels { - -template -struct WinogradTransformFunctor; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLWinogradTransformKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradTransformKernel); -}; -template -struct WinogradTransformFunctor : OpKernel { - WinogradTransformFunctor(OpKernelContext *context, - const Padding &padding_type, - const std::vector &paddings, - const int block_size); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - - -template -struct WinogradInverseTransformFunctor; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLWinogradInverseTransformKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &inputs, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); -}; -template -struct WinogradInverseTransformFunctor : OpKernel { - WinogradInverseTransformFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit, - const int block_size); - - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_WINOGRAD_TRANSFORM_H_ diff --git a/mace/libmace/BUILD b/mace/libmace/BUILD index c01b7e9f65350ca5f78540e0bcc3035b94b52016..bc44c109ed5412532a4e5987613c7b425a19f2ac 100644 --- a/mace/libmace/BUILD +++ b/mace/libmace/BUILD @@ -40,6 +40,7 @@ 
cc_library( deps = [ "//mace/public", "//mace/ops", + "//mace/kernels", ], alwayslink = 1, ) diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 6e1b44d8be5b7844bc5466c66a8299913c9b280e..52584abb41d4438ffb4591030c8325611bdd7f08 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -22,7 +22,8 @@ #include "mace/core/net.h" #include "mace/core/device_context.h" -#include "mace/ops/ops_register.h" +#include "mace/kernels/ops_register.h" +#include "mace/ops/ops_def_register.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -237,7 +238,7 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) MaceStatus MaceEngineConfig::Impl::SetGPUContext( std::shared_ptr context) { gpu_context_ = context; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus MaceEngineConfig::Impl::SetGPUHints( @@ -245,7 +246,7 @@ MaceStatus MaceEngineConfig::Impl::SetGPUHints( GPUPriorityHint priority_hint) { gpu_perf_hint_ = perf_hint; gpu_priority_hint_ = priority_hint; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( @@ -255,7 +256,7 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( num_threads_ = num_threads; cpu_affinity_policy_ = policy; use_gemmlowp_ = use_gemmlowp; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } @@ -358,7 +359,8 @@ class MaceEngine::Impl { private: const unsigned char *model_data_; size_t model_data_size_; - std::shared_ptr op_registry_; + std::unique_ptr op_def_registry_; + std::unique_ptr op_registry_; DeviceType device_type_; std::unique_ptr device_; std::unique_ptr ws_; @@ -375,7 +377,8 @@ class MaceEngine::Impl { MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), model_data_size_(0), - op_registry_(new OperatorRegistry()), + op_def_registry_(new OpDefRegistry()), + op_registry_(new OpRegistry), device_type_(config.impl_->device_type()), device_(nullptr), ws_(new Workspace()), @@ -462,10 +465,21 @@ 
MaceStatus MaceEngine::Impl::Init( model_data)); // Init model - auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(), - NetMode::INIT); + auto net = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + net_def, + ws_.get(), + device_.get(), + NetMode::INIT)); + MACE_RETURN_IF_ERROR(net->Init()); MACE_RETURN_IF_ERROR(net->Run()); - net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get()); + net_ = std::unique_ptr(new SerialNet(op_def_registry_.get(), + op_registry_.get(), + net_def, + ws_.get(), + device_.get())); + MACE_RETURN_IF_ERROR(net_->Init()); #ifdef MACE_ENABLE_HEXAGON } #endif @@ -563,6 +577,7 @@ MaceStatus MaceEngine::Impl::Run( #ifdef MACE_ENABLE_OPENCL if (device_type_ == GPU) { + device_->opencl_runtime()->command_queue().finish(); device_->opencl_runtime()->SaveBuiltCLProgram(); } #endif @@ -582,10 +597,10 @@ MaceStatus MaceEngine::Impl::Run( std::memcpy(output.second.data().get(), output_tensor->data(), output_size * sizeof(float)); } else { - return MACE_INVALID_ARGS; + return MaceStatus::MACE_INVALID_ARGS; } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceEngine::MaceEngine(const MaceEngineConfig &config): diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 54a885aba5011dcf80f847540b8c262f691e8718..d039f8c84bdb665c6d19404dd8961cafa245e247 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -41,34 +41,20 @@ cc_library( "-DMACE_ENABLE_HEXAGON", ]), deps = [ - "//mace/ops", + "ops", + "//mace/kernels", "@gtest", ], ) cc_library( name = "ops", - srcs = glob( - ["*.cc"], - exclude = [ - "*_test.cc", - "*_benchmark.cc", - "ops_test_util.cc", - "buffer_transform.cc", - "buffer_inverse_transform.cc", - "lstmcell.cc", - ], - ) + if_opencl_enabled( - [ - "buffer_transform.cc", - "buffer_inverse_transform.cc", - "lstmcell.cc", - ], - ), - hdrs = glob( - ["*.h"], - exclude = glob(["*_test_util.h"]), - ), + srcs = [ + "ops_def_register.cc", + ], + hdrs = [ + "ops_def_register.h", + ], 
copts = [ "-Werror", "-Wextra", @@ -84,7 +70,7 @@ cc_library( "-DMACE_ENABLE_HEXAGON", ]), deps = [ - "//mace/kernels", + "//mace/core", ], ) diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc deleted file mode 100644 index 44b2ba90c035d3dda95f4bcf31497f3a3b08f205..0000000000000000000000000000000000000000 --- a/mace/ops/activation.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/activation.h" - -namespace mace { -namespace ops { - -void Register_Activation(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ActivationOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/activation.h b/mace/ops/activation.h deleted file mode 100644 index 3b48891e769b7133f9b780a66d6ada4760b4ee7e..0000000000000000000000000000000000000000 --- a/mace/ops/activation.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ACTIVATION_H_ -#define MACE_OPS_ACTIVATION_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/activation.h" - -namespace mace { -namespace ops { - -template -class ActivationOp : public Operator { - public: - ActivationOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - static_cast( - OperatorBase::GetOptionalArg("max_limit", 0.0f))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(0); - const Tensor *alpha_tensor = - this->InputSize() >= 2 ? 
this->Input(1) : nullptr; - Tensor *output_tensor = this->Output(0); - MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensor)); - - return functor_(input_tensor, alpha_tensor, output_tensor, future); - } - - private: - kernels::ActivationFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ACTIVATION_H_ diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index 341b5f713fe86a87b6a3a4c1ee3c556ff8d2ebbf..1f16879f24ba03b163c4b94441043a64740631bc 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index 49422f3a11969c89e1c55453bbfc40e44f797bdc..01735e978ab5b51e655e917b30725786ea27501d 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc deleted file mode 100644 index a30cba48b4f2fb2bf6620cd23b807c6d4462f451..0000000000000000000000000000000000000000 --- a/mace/ops/addn.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/addn.h" - -namespace mace { -namespace ops { - -void Register_AddN(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - AddNOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/addn.h b/mace/ops/addn.h deleted file mode 100644 index 4238a013e455723f9ad88cbdec8dee79be862885..0000000000000000000000000000000000000000 --- a/mace/ops/addn.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ADDN_H_ -#define MACE_OPS_ADDN_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/addn.h" - -namespace mace { -namespace ops { - -template -class AddNOp : public Operator { - public: - AddNOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - Tensor *output_tensor = this->Output(0); - int n = this->inputs_.size(); - std::vector inputs(n, nullptr); - inputs[0] = this->Input(0); - for (int i = 1; i < n; ++i) { - inputs[i] = this->Input(i); - MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size()); - MACE_CHECK(inputs[0]->size() == inputs[i]->size()) - << "Input 0: " << MakeString(inputs[0]->shape()) - << ", size: " << inputs[0]->size() << ". Input " << i << ": " - << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size(); - } - return functor_(inputs, output_tensor, future); - } - - private: - kernels::AddNFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ADDN_H_ diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 1b72c79124cb2cd94fc53687e243fa804023d563..a155d85466c16fec1a9b3da68e6e49afe6a71040 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 7154ad52d097a9c09144f0b1d1630ca8be538e20..5116e36bd2c9145349cbf0eff76c7ded07951d79 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc deleted file mode 100644 index e14b7bb8c193b153c5a5f36c563b62b98d57607a..0000000000000000000000000000000000000000 --- a/mace/ops/argmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/argmax.h" - -namespace mace { -namespace ops { - -void Register_ArgMax(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ArgMax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ArgMaxOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/argmax.h b/mace/ops/argmax.h deleted file mode 100644 index b1d7ec4efc4d7d448eb6676d838730bfd5450386..0000000000000000000000000000000000000000 --- a/mace/ops/argmax.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARGMAX_H_ -#define MACE_OPS_ARGMAX_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/argmax.h" - -namespace mace { -namespace ops { - -template -class ArgMaxOp : public Operator { - public: - ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(0); - const Tensor *axis = this->Input(1); - Tensor *output = this->Output(0); - return functor_(input, axis, output, future); - } - - private: - kernels::ArgMaxFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT, AXIS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARGMAX_H_ diff --git a/mace/ops/argmax_test.cc b/mace/ops/argmax_test.cc index ca7ece351801ef781edbd04826a2fb285ee1f77c..06de7046d01a34e131c596204e95fca6d08f0473 100644 --- a/mace/ops/argmax_test.cc +++ b/mace/ops/argmax_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc deleted file mode 100644 index c1a6c0cf3c292df95c5f94fb42ff2ca2d1987577..0000000000000000000000000000000000000000 --- a/mace/ops/batch_norm.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/batch_norm.h" - -namespace mace { -namespace ops { - -void Register_BatchNorm(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h deleted file mode 100644 index 7221c3ca1f10b535d1e570f4356e3720ac298a7d..0000000000000000000000000000000000000000 --- a/mace/ops/batch_norm.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_BATCH_NORM_H_ -#define MACE_OPS_BATCH_NORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/batch_norm.h" - -namespace mace { -namespace ops { - -template -class BatchNormOp : public Operator { - public: - BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, false, kernels::ActivationType::NOOP, 0.0f) { - epsilon_ = OperatorBase::GetOptionalArg("epsilon", - static_cast(1e-4)); - } - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *scale = this->Input(SCALE); - const Tensor *offset = this->Input(OFFSET); - const Tensor *mean = this->Input(MEAN); - const Tensor *var = this->Input(VAR); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", - input->dim_size()); - MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", - scale->dim_size()); - MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", - offset->dim_size()); - MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", - mean->dim_size()); - MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. 
", - var->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - return functor_(input, scale, offset, - mean, var, epsilon_, output, future); - } - - private: - float epsilon_; - kernels::BatchNormFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BATCH_NORM_H_ diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 648ddfca6b1606cd52f5343767d48533766d1143..c390860e79599092ec8d624626caa9e4c712bd51 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 7d5b77daf1eb3cdb7a4402f83657421618ff2f44..3c22d5ff5981977c04d598585e0faead97534bf2 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc deleted file mode 100644 index 103e12977c01021fc0a3d4008558935da96b9ab5..0000000000000000000000000000000000000000 --- a/mace/ops/batch_to_space.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/batch_to_space.h" - -namespace mace { -namespace ops { - -void Register_BatchToSpaceND(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h deleted file mode 100644 index 458db28481f82860372dea3ba7a55ce8ea50f404..0000000000000000000000000000000000000000 --- a/mace/ops/batch_to_space.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_BATCH_TO_SPACE_H_ -#define MACE_OPS_BATCH_TO_SPACE_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/batch_to_space.h" - -namespace mace { -namespace ops { - -template -class BatchToSpaceNDOp : public Operator { - public: - BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), - OperatorBase::GetRepeatedArgs("block_shape", {1, 1})) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *batch_tensor = this->Input(INPUT); - Tensor *space_tensor = this->Output(OUTPUT); - return functor_(batch_tensor, space_tensor, future); - } - - private: - kernels::BatchToSpaceFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BATCH_TO_SPACE_H_ diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index c6b3e25a5b95208db2042384e00460333040ba96..4cf55b334f5e47bcf82953fa1f31148b1494a22c 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc deleted file mode 100644 index bf082cf9286940858f7ef5eb9cfeae06b43252af..0000000000000000000000000000000000000000 --- a/mace/ops/bias_add.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/bias_add.h" - -namespace mace { -namespace ops { - -void Register_BiasAdd(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h deleted file mode 100644 index ee3de99116fea2a49153c2d1f79a73b570f8b02d..0000000000000000000000000000000000000000 --- a/mace/ops/bias_add.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_BIAS_ADD_H_ -#define MACE_OPS_BIAS_ADD_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/bias_add.h" - -namespace mace { -namespace ops { - -template -class BiasAddOp : public Operator { - public: - BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - static_cast(OperatorBase::GetOptionalArg( - "data_format", NHWC))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *bias = this->Input(BIAS); - - MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. ", - bias->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, bias, output, future); - } - - private: - kernels::BiasAddFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BIAS_ADD_H_ diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index ca8500edf9170893eee1f1458db82347254e5282..5908caa2d9adc74e54539ae859672c780b471b65 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 51c8cc8871f370f878025b919bde91d92d39fba1..771065c28caceecd7ef46886bb9e6bfaa5bd284e 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc deleted file mode 100644 index af52d482c95d4185c6ab5ee5cf9f7aa0bc52c688..0000000000000000000000000000000000000000 --- a/mace/ops/buffer_inverse_transform.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/buffer_inverse_transform.h" - -namespace mace { -namespace ops { - -void Register_BufferInverseTransform(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferInverseTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferInverseTransformOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/buffer_inverse_transform.h b/mace/ops/buffer_inverse_transform.h deleted file mode 100644 index 9eefb0f0be2e6f3fbbdbe7253fd56a67fefb1b1c..0000000000000000000000000000000000000000 --- a/mace/ops/buffer_inverse_transform.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/buffer_inverse_transform.h" - -namespace mace { -namespace ops { - -template -class BufferInverseTransformOp : public Operator { - public: - BufferInverseTransformOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - kernels::BufferType type = - static_cast(OperatorBase::GetOptionalArg( - "buffer_type", static_cast(kernels::CONV2D_FILTER))); - return functor_(input, type, output, future); - } - - private: - kernels::BufferInverseTransformFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 7d94c525f7a59011d53f3e6f9f78538e1f91b271..fb1cf51cb58ed0a2336a72bac3a278cd4cee6bbb 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc deleted file mode 100644 index bab1b894d4955f9269c905ef57f970537d3d837b..0000000000000000000000000000000000000000 --- a/mace/ops/buffer_transform.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/buffer_transform.h" - -namespace mace { -namespace ops { - -void Register_BufferTransform(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferTransformOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/buffer_transform.h b/mace/ops/buffer_transform.h deleted file mode 100644 index 94a4779f94ae752cdb779b50c234fa1e679f790c..0000000000000000000000000000000000000000 --- a/mace/ops/buffer_transform.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_BUFFER_TRANSFORM_H_ -#define MACE_OPS_BUFFER_TRANSFORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/buffer_transform.h" - -namespace mace { -namespace ops { - -template -class BufferTransformOp : public Operator { - public: - BufferTransformOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(INPUT); - - kernels::BufferType type = - static_cast(OperatorBase::GetOptionalArg( - "buffer_type", static_cast(kernels::CONV2D_FILTER))); - Tensor *output = this->Output(OUTPUT); - - return functor_(input_tensor, type, output, future); - } - - private: - kernels::BufferTransformFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace -#endif // MACE_OPS_BUFFER_TRANSFORM_H_ diff --git a/mace/ops/cast.cc b/mace/ops/cast.cc deleted file mode 100644 index 87abfdd46eac3c4064ea448569d396005434970d..0000000000000000000000000000000000000000 --- a/mace/ops/cast.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/cast.h" - -namespace mace { -namespace ops { - -void Register_Cast(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Cast") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CastOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Cast") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CastOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/cast_test.cc b/mace/ops/cast_test.cc index f35d3af6ee6237db274e891e6db37e1fd31fa366..a00649937e51577829f6fd30361133f95fdb8cfe 100644 --- a/mace/ops/cast_test.cc +++ b/mace/ops/cast_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc deleted file mode 100644 index e13ac92a60390d5e76277a3afec4222e56336ab9..0000000000000000000000000000000000000000 --- a/mace/ops/channel_shuffle.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/channel_shuffle.h" - -namespace mace { -namespace ops { - -void Register_ChannelShuffle(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h deleted file mode 100644 index a459a0b38e115ace4e4333ce5ca3dc5539f61afe..0000000000000000000000000000000000000000 --- a/mace/ops/channel_shuffle.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_CHANNEL_SHUFFLE_H_ -#define MACE_OPS_CHANNEL_SHUFFLE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/channel_shuffle.h" - -namespace mace { -namespace ops { - -template -class ChannelShuffleOp : public Operator { - public: - ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - group_(OperatorBase::GetOptionalArg("group", 1)), - functor_(context, this->group_) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - int channels; - if (D == GPU) { - channels = input->dim(3); - } else if (D == CPU) { - channels = input->dim(1); - } else { - MACE_NOT_IMPLEMENTED; - } - MACE_CHECK(channels % group_ == 0, - "input channels must be an integral multiple of group. ", - input->dim(3)); - return functor_(input, output, future); - } - - protected: - const int group_; - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::ChannelShuffleFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CHANNEL_SHUFFLE_H_ diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 49f494c8a3bf23f64253ea68d314cbdc484d8f1a..d45216ebcbccd2fa2a2485db63abaa80a18d7da3 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index 2102fe7652b2b552d8f9c8caeb09abfa786c1a57..1ce0cea1aee8940d7d345a4c25a5948b461c1892 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc deleted file mode 100644 index 6a860a42b4c13ab3e37ebd27a287ae2cc5e4dbf8..0000000000000000000000000000000000000000 --- a/mace/ops/concat.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/concat.h" - -namespace mace { -namespace ops { - -void Register_Concat(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/concat.h b/mace/ops/concat.h deleted file mode 100644 index 94dee3d33dd8876183bb9934874b6f1cd4d2766f..0000000000000000000000000000000000000000 --- a/mace/ops/concat.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_CONCAT_H_ -#define MACE_OPS_CONCAT_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/concat.h" - -namespace mace { -namespace ops { - -template -class ConcatOp : public Operator { - public: - ConcatOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->InputSize() >= 2) - << "There must be at least two inputs to concat"; - const std::vector input_list = this->Inputs(); - const int32_t concat_axis = OperatorBase::GetOptionalArg("axis", 3); - const int32_t input_dims = input_list[0]->dim_size(); - const int32_t axis = - concat_axis < 0 ? concat_axis + input_dims : concat_axis; - MACE_CHECK((0 <= axis && axis < input_dims), - "Expected concatenating axis in the range [", -input_dims, ", ", - input_dims, "], but got", concat_axis); - - Tensor *output = this->Output(OUTPUT); - - return functor_(input_list, output, future); - } - - private: - kernels::ConcatFunctor functor_; - - private: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CONCAT_H_ diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 5375cb6d487c6d8211aaca90bd8fc3cd23de56ec..486d9b6e78d6f100abe528998000032209e25c4f 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 671b8f617e700ba8e25805e1c802f04c692cf657..431e7a2d4a893da86e501e46ad741b2ee600764b 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -16,7 +16,6 @@ #include #include "gmock/gmock.h" -#include "mace/ops/concat.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc deleted file mode 100644 index 516520f961b85484578e083370a470d597cc6a4d..0000000000000000000000000000000000000000 --- a/mace/ops/conv_2d.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/conv_2d.h" - -namespace mace { -namespace ops { - -void Register_Conv2D(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h deleted file mode 100644 index 9f731fa483a41e53b71ee0915667eedc6a72c605..0000000000000000000000000000000000000000 --- a/mace/ops/conv_2d.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_CONV_2D_H_ -#define MACE_OPS_CONV_2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/conv_2d.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class Conv2dOp : public ConvPool2dOpBase { - public: - Conv2dOp(const OperatorDef &op_def, OpKernelContext *context) - : ConvPool2dOpBase(op_def, context), - functor_(context, - this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data(), - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *filter = this->Input(FILTER); - const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; - Tensor *output = this->Output(OUTPUT); - return functor_(input, filter, bias, output, future); - } - - private: - kernels::Conv2dFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CONV_2D_H_ diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 313cd35bbcc4253c029535666cfba30b3ba1fdd2..76e3696d9207f20c54dd9ddab6d29b1bade4af74 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -14,9 +14,9 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 354b1935edff27fe2b917409eccc9573bb5f2b53..28037011ec8fe36d1c069adcfde0a8f4a0e44177 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -15,8 +15,7 @@ #include #include -#include 
"mace/kernels/quantize.h" -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index 6c1b25de265619aaaa665ad153271b53fc567a01..5afd621fbe6220ed03e62c0cec281258da5dd0dd 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -52,17 +52,26 @@ TEST(CoreTest, INIT_MODE) { NetDef net_def; for (auto &op_def : op_defs) { net_def.add_op()->CopyFrom(op_def); + net_def.add_op_types(op_def.type()); } - std::shared_ptr op_registry(new OperatorRegistry()); - auto net = - CreateNet(op_registry, net_def, &ws, device, NetMode::INIT); - net->Run(); + std::shared_ptr op_def_registry(new OpDefRegistry()); + std::shared_ptr op_registry(new OpRegistry()); + auto net = std::unique_ptr(new SerialNet( + op_def_registry.get(), op_registry.get(), &net_def, &ws, device, + NetMode::INIT)); + MaceStatus status = net->Init(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + status = net->Run(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - - net = CreateNet(op_registry, net_def, &ws, device); - net->Run(); + net = std::unique_ptr(new SerialNet( + op_def_registry.get(), op_registry.get(), &net_def, &ws, device)); + status = net->Init(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + status = net->Run(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); EXPECT_TRUE(ws.GetTensor("Output") != nullptr); ExpectTensorNear(*ws.GetTensor("Input"), *ws.GetTensor("Output"), diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc deleted file mode 100644 index 7ed2e9c01999b73e5bb0eb223a80759e012a787e..0000000000000000000000000000000000000000 --- a/mace/ops/crop.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/crop.h" - -namespace mace { -namespace ops { - -void Register_Crop(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Crop") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CropOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Crop") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - CropOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Crop") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - CropOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/crop.h b/mace/ops/crop.h deleted file mode 100644 index f50450693580a0d193cac1975e5903e1e624cfd5..0000000000000000000000000000000000000000 --- a/mace/ops/crop.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_CROP_H_ -#define MACE_OPS_CROP_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/crop.h" - -namespace mace { -namespace ops { - -template -class CropOp : public Operator { - public: - CropOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("axis", 2), - OperatorBase::GetRepeatedArgs("offset")) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->InputSize() >= 2) - << "There must be two inputs to crop"; - const std::vector input_list = this->Inputs(); - Tensor *output = this->Output(0); - return functor_(input_list, output, future); - } - - private: - kernels::CropFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CROP_H_ diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index e3d22a0752cb851834c7d198422418c2afdae398..75cd494f78ed588da80e0184f8489b8e0a6155a3 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index b4bb7fddf1e1c630e5ad3897fcd7558fe5c5662c..67a2fdeb3f75baeecefc1d1bda9e6c2419feaafc 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc deleted file mode 100644 index af0d7232e3e42745c24241f915006acb5623c64e..0000000000000000000000000000000000000000 --- a/mace/ops/deconv_2d.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/deconv_2d.h" - -namespace mace { -namespace ops { - -void Register_Deconv2D(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h deleted file mode 100644 index 03c4581d4af177d7918f41094668c3860ae74a3e..0000000000000000000000000000000000000000 --- a/mace/ops/deconv_2d.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_DECONV_2D_H_ -#define MACE_OPS_DECONV_2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/deconv_2d.h" - -namespace mace { -namespace ops { - -template -class Deconv2dOp : public Operator { - public: - Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("strides"), - static_cast(OperatorBase::GetOptionalArg( - "padding", static_cast(SAME))), - OperatorBase::GetRepeatedArgs("padding_values"), - static_cast( - OperatorBase::GetOptionalArg("framework_type", 0)), - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->InputSize() >= 2, "deconv needs >= 2 inputs."); - const Tensor *input = this->Input(0); - const Tensor *filter = this->Input(1); - kernels::FrameworkType model_type = - static_cast( - OperatorBase::GetOptionalArg("framework_type", 0)); - if (model_type == kernels::CAFFE) { - const Tensor *bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; - Tensor *output = this->Output(OUTPUT); - - return functor_(input, filter, bias, nullptr, output, future); - } else { - const Tensor *output_shape = - this->InputSize() >= 3 ? this->Input(2) : nullptr; - const Tensor *bias = this->InputSize() >= 4 ? 
this->Input(3) : nullptr; - Tensor *output = this->Output(OUTPUT); - - return functor_(input, filter, bias, output_shape, output, future); - } - } - - private: - kernels::Deconv2dFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_DECONV_2D_H_ diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index cece56ce567b2bbcab688c1f8101ff3ff949d365..197e8f73968fca1544d611a8bb81845e04495a1a 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -14,9 +14,9 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/deconv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index ef4d426f05ea84942ab90f36950e88426315903a..884764148ab98c463f2a2f7806c20aefb491c3db 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/ops/deconv_2d.h" +#include "mace/kernels/deconv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc deleted file mode 100644 index 0da2bb00865d0a1b47a3295bf143e600fd392c6a..0000000000000000000000000000000000000000 --- a/mace/ops/depth_to_space.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/depth_to_space.h" - -namespace mace { -namespace ops { - -void Register_DepthToSpace(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h deleted file mode 100644 index c2946b849a72ea1215e49c4875887c4cf0b49b0d..0000000000000000000000000000000000000000 --- a/mace/ops/depth_to_space.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_DEPTH_TO_SPACE_H_ -#define MACE_OPS_DEPTH_TO_SPACE_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/depth_to_space.h" - -namespace mace { -namespace ops { - -template -class DepthToSpaceOp : public Operator { - public: - DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - block_size_(OperatorBase::GetOptionalArg("block_size", 1)), - functor_(context, this->block_size_) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - - return functor_(input, output, future); - } - - protected: - const int block_size_; - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::DepthToSpaceOpFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_DEPTH_TO_SPACE_H_ diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index 431151671dac90fb6b4e535db62ea256ff17d794..45bc60363854d4b044bdf7acd4fd7b04ec8602ab 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 768f7c1a0e8e98a698d6b9e256ffe6da8a4978c4..fdce99c129fe4430ba1cef9a3114153b0abaf773 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -15,7 +15,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc deleted file mode 100644 index 61f87e5f98be2e5e7466e0f1ad5c16608f52a73b..0000000000000000000000000000000000000000 --- a/mace/ops/depthwise_conv2d.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/depthwise_conv2d.h" - -namespace mace { -namespace ops { - -void Register_DepthwiseConv2d(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h deleted file mode 100644 index 549af07a3977b65464288a96096b42cb22c2ac6d..0000000000000000000000000000000000000000 --- a/mace/ops/depthwise_conv2d.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_DEPTHWISE_CONV2D_H_ -#define MACE_OPS_DEPTHWISE_CONV2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/depthwise_conv2d.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class DepthwiseConv2dOp : public ConvPool2dOpBase { - public: - DepthwiseConv2dOp(const OperatorDef &op_def, OpKernelContext *context) - : ConvPool2dOpBase(op_def, context), - functor_(context, - this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data(), - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *filter = this->Input(FILTER); - const Tensor *bias = nullptr; - if (this->InputSize() >= 3) { - bias = this->Input(BIAS); - } - Tensor *output = this->Output(OUTPUT); - return functor_(input, filter, bias, output, future); - } - - private: - kernels::DepthwiseConv2dFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_DEPTHWISE_CONV2D_H_ diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 60abfaf3bb99b3dc62f008983de7bec110b618a2..3257e580c5f852d9c9a08a89539afc5d2860f0a7 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -14,9 +14,9 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 
39dd69449b91b88fa167cb51e1b8a8e9655a70ee..3089286c30061090e54d05cbf0dc8826719caec2 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc deleted file mode 100644 index 2e82fb70bd03c966d579cb6ce3fdeca25a93b755..0000000000000000000000000000000000000000 --- a/mace/ops/eltwise.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/eltwise.h" - -namespace mace { -namespace ops { - -void Register_Eltwise(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h deleted file mode 100644 index f795256218eed2087d372e1acdbe5ba1db2fce96..0000000000000000000000000000000000000000 --- a/mace/ops/eltwise.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ELTWISE_H_ -#define MACE_OPS_ELTWISE_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/eltwise.h" - -namespace mace { -namespace ops { - -template -class EltwiseOp : public Operator { - public: - EltwiseOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_( - context, - static_cast(OperatorBase::GetOptionalArg( - "type", static_cast(kernels::EltwiseType::NONE))), - OperatorBase::GetRepeatedArgs("coeff"), - OperatorBase::GetOptionalArg("scalar_input", 1.0), - OperatorBase::GetOptionalArg("scalar_input_index", 1), - static_cast(OperatorBase::GetOptionalArg( - "data_format", 0))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; - Tensor *output = this->Output(OUTPUT); - return functor_(input0, input1, output, future); - } - - private: - kernels::EltwiseFunctor functor_; - - private: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ELTWISE_H_ diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index d12c97b3aa87886b5c90db60a02a792c7b0d561e..4a8fa0415e2222c2c771ab7ecbdb4e173ebe38f7 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/eltwise.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index d1506987983faac18bc58af87f70002095f79f2b..da9687ce1ef321f312ce6faa0ec829075581666a 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "mace/kernels/eltwise.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc deleted file mode 100644 index 5b10d5a37a68644edaa30a111b00a350ac0a28ec..0000000000000000000000000000000000000000 --- a/mace/ops/expand_dims.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/expand_dims.h" - -namespace mace { -namespace ops { - -void Register_ExpandDims(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ExpandDims") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ExpandDimsOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ExpandDims") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ExpandDimsOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ExpandDims") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ExpandDimsOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/expand_dims.h b/mace/ops/expand_dims.h deleted file mode 100644 index b7363c3c6e3930fdb716c90c7ce932032835d715..0000000000000000000000000000000000000000 --- a/mace/ops/expand_dims.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_EXPAND_DIMS_H_ -#define MACE_OPS_EXPAND_DIMS_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/expand_dims.h" - -namespace mace { -namespace ops { - -template -class ExpandDimsOp : public Operator { - public: - ExpandDimsOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - return functor_(input, output, future); - } - - private: - kernels::ExpandDimsFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_EXPAND_DIMS_H_ diff --git a/mace/ops/expand_dims_test.cc b/mace/ops/expand_dims_test.cc index f5650c9cd141514d4fe47167e72f48cc79ad1646..ac3312eac245c8098fc93125b9f5f1c7f09aaa22 100644 --- a/mace/ops/expand_dims_test.cc +++ b/mace/ops/expand_dims_test.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/fill.cc b/mace/ops/fill.cc deleted file mode 100644 index 93e6daddcf50c9db4b7dea2196a2e275e2620d18..0000000000000000000000000000000000000000 --- a/mace/ops/fill.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/fill.h" - -namespace mace { -namespace ops { - -void Register_Fill(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Fill") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FillOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/fill.h b/mace/ops/fill.h deleted file mode 100644 index b6836d11978d7263439b03eda7b072feacf06c19..0000000000000000000000000000000000000000 --- a/mace/ops/fill.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_FILL_H_ -#define MACE_OPS_FILL_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/fill.h" - -namespace mace { -namespace ops { - -template -class FillOp : public Operator { - public: - FillOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *shape = this->Input(SHAPE); - const Tensor *value = this->Input(VALUE); - Tensor *output = this->Output(OUTPUT); - return functor_(shape, value, output, future); - } - - private: - kernels::FillFunctor functor_; - - MACE_OP_INPUT_TAGS(SHAPE, VALUE); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FILL_H_ diff --git a/mace/ops/fill_test.cc b/mace/ops/fill_test.cc index 1808b0b52bbbe2ab9ac46246b63a83477292895e..8ecbed5d2823b7cfc12f3967ce6dea6ff7b882cc 100644 --- a/mace/ops/fill_test.cc +++ b/mace/ops/fill_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc deleted file mode 100644 index f760075077396e699b16f3da15a8f57e4523623b..0000000000000000000000000000000000000000 --- a/mace/ops/folded_batch_norm.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/folded_batch_norm.h" - -namespace mace { -namespace ops { - -void Register_FoldedBatchNorm(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h deleted file mode 100644 index 345d87b476ded184fa7b02ba8c47072589e41bc6..0000000000000000000000000000000000000000 --- a/mace/ops/folded_batch_norm.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_FOLDED_BATCH_NORM_H_ -#define MACE_OPS_FOLDED_BATCH_NORM_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/batch_norm.h" - -namespace mace { -namespace ops { - -template -class FoldedBatchNormOp : public Operator { - public: - FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - true, - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *scale = this->Input(SCALE); - const Tensor *offset = this->Input(OFFSET); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", - input->dim_size()); - MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", - scale->dim_size()); - MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. 
", - offset->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, scale, offset, nullptr, nullptr, 0, output, future); - } - - private: - kernels::BatchNormFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FOLDED_BATCH_NORM_H_ diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 16a6ad684809436832569a285158105e3b9137f2..a19d7d7730ebe84f3b8f97a64b3ab1ad4fdad12e 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { @@ -51,7 +51,7 @@ void Simple() { if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -68,7 +68,7 @@ void Simple() { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -115,7 +115,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -140,7 +140,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", 
"FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -177,7 +177,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -202,7 +202,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -240,7 +240,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -265,7 +265,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -301,7 +301,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -326,7 +326,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") 
diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc deleted file mode 100644 index 31f3bf869c07c0bf1ebbf14deeb234365624a9b2..0000000000000000000000000000000000000000 --- a/mace/ops/fully_connected.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/fully_connected.h" - -namespace mace { -namespace ops { - -void Register_FullyConnected(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h deleted file mode 100644 index 313780cb3b9b39d568005ee84fa154390b13e827..0000000000000000000000000000000000000000 --- a/mace/ops/fully_connected.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2018 
Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_FULLY_CONNECTED_H_ -#define MACE_OPS_FULLY_CONNECTED_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/fully_connected.h" - -namespace mace { -namespace ops { - -template -class FullyConnectedOp : public Operator { - public: - FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *weight = this->Input(WEIGHT); // OIHW - const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; - Tensor *output = this->Output(OUTPUT); - - if (D == DeviceType::CPU) { - MACE_CHECK( - input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) && - input->dim(3) == weight->dim(3), - "The shape of Input: ", MakeString(input->shape()), - "The shape of Weight: ", MakeString(weight->shape()), - " don't match."); - } else { - MACE_CHECK( - input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) && - input->dim(3) == weight->dim(1), - "The shape of Input: ", MakeString(input->shape()), - "The shape of Weight: ", MakeString(weight->shape()), - " don't match."); - } - if (bias) { - MACE_CHECK(weight->dim(0) == bias->dim(0), - "The shape of Weight: ", MakeString(weight->shape()), - " and shape of Bias: ", bias->dim(0), - " don't match."); - } - - return functor_(input, weight, - bias, output, future); - } - - private: - kernels::FullyConnectedFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FULLY_CONNECTED_H_ diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index 9f0fe549af4b903d89d478aecd5d70857ea94e4e..66af8792ab13b9d0bb4b9402702fdd55ed229f5a 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 4bdc53f4cab924886e1d9df5c582bac26e2dfa7c..d075aac297072944d32af03b6fbc3670b4b90b3f 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -14,8 +14,7 @@ #include -#include "mace/core/operator.h" -#include "mace/kernels/quantize.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { 
diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc deleted file mode 100644 index 12891c5d9ce00db7fd1dd25a4145a07b922f797b..0000000000000000000000000000000000000000 --- a/mace/ops/gather.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/gather.h" - -namespace mace { -namespace ops { - -void Register_Gather(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Gather") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - GatherOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/gather.h b/mace/ops/gather.h deleted file mode 100644 index fe4026d969835cc1dc456258194d40d7fb120584..0000000000000000000000000000000000000000 --- a/mace/ops/gather.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_GATHER_H_ -#define MACE_OPS_GATHER_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/gather.h" - -namespace mace { -namespace ops { - -template -class GatherOp : public Operator { - public: - GatherOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("axis", 0), - OperatorBase::GetOptionalArg("y", 1.0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *params = this->Input(PARAMS); - const Tensor *indices = this->Input(INDICES); - Tensor *output = this->Output(OUTPUT); - - return functor_(params, indices, output, future); - } - - private: - kernels::GatherFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(PARAMS, INDICES); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_GATHER_H_ diff --git a/mace/ops/gather_benchmark.cc b/mace/ops/gather_benchmark.cc index f55b74620564930a7ae2319dd8f30ae3e9468251..8a0cd123a51293e94b85d048dda4db460d18ac8d 100644 --- a/mace/ops/gather_benchmark.cc +++ b/mace/ops/gather_benchmark.cc @@ -14,9 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/kernels/gather.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 07a8438c515c88a9ae2631f79e52f27d45bfe237..c716b12ada8b9d3794ecc9aba605649c703aae9f 100644 --- a/mace/ops/gather_test.cc +++ b/mace/ops/gather_test.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/identity.cc b/mace/ops/identity.cc deleted file mode 100644 index 61a3335672e4d8b0f2e358dc40728d4271ea174e..0000000000000000000000000000000000000000 --- a/mace/ops/identity.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/identity.h" - -namespace mace { -namespace ops { - -void Register_Identity(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - IdentityOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - IdentityOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - IdentityOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - IdentityOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/identity_test.cc b/mace/ops/identity_test.cc index 988ce760c56d96a79f14520a857ce300e4869b00..1ef8848d739e4037f2cac5fbb5df03f7bd4c1054 100644 --- a/mace/ops/identity_test.cc +++ b/mace/ops/identity_test.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc deleted file mode 100644 index 26aec062354d6f65d699f02f4ef976fba118fa97..0000000000000000000000000000000000000000 --- a/mace/ops/infer_conv2d_shape.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/infer_conv2d_shape.h" - -namespace mace { -namespace ops { - -void Register_InferConv2dShape(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/infer_conv2d_shape_test.cc b/mace/ops/infer_conv2d_shape_test.cc index 4f2e0b769e237ab1e04eef07114480718e867b00..735a599c723edfff0308f73d76671e41503d8e0c 100644 --- a/mace/ops/infer_conv2d_shape_test.cc +++ b/mace/ops/infer_conv2d_shape_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" #include "mace/kernels/conv_pool_2d_util.h" diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc deleted file mode 100644 index f3e199706da84f9bb902eed8cb427a211e79261b..0000000000000000000000000000000000000000 --- a/mace/ops/local_response_norm.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/local_response_norm.h" - -namespace mace { -namespace ops { - -void Register_LocalResponseNorm(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - LocalResponseNormOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h deleted file mode 100644 index 66265f19e0fcef441e7374072c17cdd525e47f71..0000000000000000000000000000000000000000 --- a/mace/ops/local_response_norm.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_LOCAL_RESPONSE_NORM_H_ -#define MACE_OPS_LOCAL_RESPONSE_NORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/local_response_norm.h" - -namespace mace { -namespace ops { - -template -class LocalResponseNormOp : public Operator { - public: - LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) { - depth_radius_ = OperatorBase::GetOptionalArg("depth_radius", 5); - bias_ = OperatorBase::GetOptionalArg("bias", 1.0f); - alpha_ = OperatorBase::GetOptionalArg("alpha", 1.0f); - beta_ = OperatorBase::GetOptionalArg("beta", 0.5f); - } - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", - input->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future); - } - - private: - int depth_radius_; - float bias_; - float alpha_; - float beta_; - kernels::LocalResponseNormFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_LOCAL_RESPONSE_NORM_H_ diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc index ee15c3e01223164f5978415deb9f862e45fa320b..893b65d1e812143a4960cd0fd01b489dbc0f645a 100644 --- a/mace/ops/local_response_norm_benchmark.cc +++ b/mace/ops/local_response_norm_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index 6bb726ead5bf3f8fbe6173013d99557cbed03209..55adcedd54be519bf3522bf559849b4c723383a2 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/lstmcell.cc b/mace/ops/lstmcell.cc deleted file mode 100644 index 9926ad4b89db9626293063a0f43e9a2d49f51a23..0000000000000000000000000000000000000000 --- a/mace/ops/lstmcell.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/lstmcell.h" - -namespace mace { -namespace ops { - -void Register_LSTMCell(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LSTMCell") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - LSTMCellOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LSTMCell") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - LSTMCellOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc index 6ab6baa1b1b0aafdbe8540d435f305f01d433971..a1972e72abc72b71a05f6228567256ee4d4ccd54 100644 --- a/mace/ops/lstmcell_benchmark.cc +++ b/mace/ops/lstmcell_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/lstmcell_test_util.h" diff --git a/mace/ops/lstmcell_test.cc b/mace/ops/lstmcell_test.cc index 1cfaad0179f24ce62ea99f1ea0d3a067711e0f38..5b26c6779290c27a6b7c98642ab37dbe1ffc7561 100644 --- a/mace/ops/lstmcell_test.cc +++ b/mace/ops/lstmcell_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/eltwise.h" #include "mace/ops/lstmcell_test_util.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/lstmcell_test_util.h b/mace/ops/lstmcell_test_util.h index 06d711516a903ff0119eb89a6b1c92ad6a03d030..bbd523c95a97552c818091248525828d441b3d05 100644 --- a/mace/ops/lstmcell_test_util.h +++ b/mace/ops/lstmcell_test_util.h @@ -17,7 +17,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/eltwise.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc deleted file mode 100644 index ca0b68e55b1569ec135a793431a8b6c9b4b51cf1..0000000000000000000000000000000000000000 --- a/mace/ops/matmul.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/matmul.h" - -namespace mace { -namespace ops { - -void Register_MatMul(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - MatMulOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - MatMulOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h deleted file mode 100644 index 64b336a38d98a52be95917a7d9750abdc1c6e1a9..0000000000000000000000000000000000000000 --- a/mace/ops/matmul.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_MATMUL_H_ -#define MACE_OPS_MATMUL_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/matmul.h" - -namespace mace { -namespace ops { - -template -class MatMulOp : public Operator { - public: - MatMulOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context), - transpose_a_(OperatorBase::GetOptionalArg("transpose_a", false)), - transpose_b_(OperatorBase::GetOptionalArg("transpose_b", false)) { - } - - MaceStatus Run(StatsFuture *future) override { - const Tensor *A = this->Input(INPUT_A); - const Tensor *B = this->Input(INPUT_B); - Tensor *C = this->Output(OUTPUT); - MACE_CHECK(A->dim_size() == B->dim_size() && A->dim_size() >= 2, - "rank(A) should be equal to rank(B), rank should be greater " - "than or equal to 2"); - index_t rank = A->dim_size(); - for (index_t i = 0; i < rank - 2; ++i) { - MACE_CHECK(A->dim(i) == B->dim(i), - "batch dimensions are not equal: ", - A->dim(i), - " vs. ", - B->dim(i)); - } - index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1); - index_t bk = transpose_b_ ? 
B->dim(rank - 1) : B->dim(rank - 2); - MACE_CHECK(ak == bk, "the number of A's column ", ak, - " must be equal to B's row ", bk); - - return functor_(A, B, C, - transpose_a_, transpose_b_, future); - } - - private: - MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - kernels::MatMulFunctor functor_; - bool transpose_a_; - bool transpose_b_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_MATMUL_H_ diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index 08b06fa7e83cc9d07436b1f2149766f798d6fb1a..c553e33d538dba84847b8eeca39be3372d2a2f0e 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 9225b2269d5f37f36c412b26695ab07f36788b69..83958c75ef27bc29de4fc21626d25029e696bb28 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/ops_def_register.cc b/mace/ops/ops_def_register.cc new file mode 100644 index 0000000000000000000000000000000000000000..46ee5184a4c666a4a42540c9b294110cab245242 --- /dev/null +++ b/mace/ops/ops_def_register.cc @@ -0,0 +1,373 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/ops_def_register.h" + +#include + +namespace mace { +namespace ops { + +void RegisterOpDefs(OpDefRegistryBase *op_def_registry) { + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Activation") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("AddN") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ArgMax") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BatchNorm") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BatchToSpaceND") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BiasAdd") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BufferInverseTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BufferTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Cast") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ChannelShuffle") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + 
MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Concat") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Conv2D") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Crop") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Deconv2D") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("DepthToSpace") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("DepthwiseConv2d") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Dequantize") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Eltwise") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ExpandDims") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Fill") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("FullyConnected") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Gather") + 
.SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Identity") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("InferConv2dShape") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("LocalResponseNorm") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("LSTMCell") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("MatMul") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Pad") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Pooling") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Quantize") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ReduceMean") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Reshape") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ResizeBicubic") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + 
MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ResizeBilinear") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Reverse") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ScalarMath") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Shape") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Softmax") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("SpaceToBatchND") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("SpaceToDepth") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Split") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("SqrDiffMean") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Squeeze") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Stack") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + 
OpRegistrationBuilder("StridedSlice") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Transpose") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Unstack") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("WinogradInverseTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("WinogradTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); +} +} // namespace ops + + +OpDefRegistry::OpDefRegistry() : OpDefRegistryBase() { + ops::RegisterOpDefs(this); +} + +} // namespace mace diff --git a/mace/kernels/kernel.h b/mace/ops/ops_def_register.h similarity index 68% rename from mace/kernels/kernel.h rename to mace/ops/ops_def_register.h index 853e974f76a5667c326c85346bfd3ba274b2cd9f..5b2d6acbf9ba951473f93cbe93d5b654d72f9baf 100644 --- a/mace/kernels/kernel.h +++ b/mace/ops/ops_def_register.h @@ -12,20 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_KERNEL_H_ -#define MACE_KERNELS_KERNEL_H_ +#ifndef MACE_OPS_OPS_DEF_REGISTER_H_ +#define MACE_OPS_OPS_DEF_REGISTER_H_ -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_def_registry.h" namespace mace { -namespace kernels { -struct OpKernel { - explicit OpKernel(OpKernelContext *context): context_(context) {} - - OpKernelContext *context_; +class OpDefRegistry : public OpDefRegistryBase { + public: + OpDefRegistry(); + ~OpDefRegistry() override = default; }; -} // namespace kernels } // namespace mace -#endif // MACE_KERNELS_KERNEL_H_ + +#endif // MACE_OPS_OPS_DEF_REGISTER_H_ diff --git a/mace/ops/ops_register.cc b/mace/ops/ops_register.cc deleted file mode 100644 index 1c29386c67b30ef31647e7a611b3385b50d38fad..0000000000000000000000000000000000000000 --- a/mace/ops/ops_register.cc +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/ops_register.h" - -namespace mace { - -namespace ops { -// Keep in lexicographical order -extern void Register_Activation(OperatorRegistryBase *op_registry); -extern void Register_AddN(OperatorRegistryBase *op_registry); -extern void Register_ArgMax(OperatorRegistryBase *op_registry); -extern void Register_BatchNorm(OperatorRegistryBase *op_registry); -extern void Register_BatchToSpaceND(OperatorRegistryBase *op_registry); -extern void Register_BiasAdd(OperatorRegistryBase *op_registry); -extern void Register_Cast(OperatorRegistryBase *op_registry); -extern void Register_ChannelShuffle(OperatorRegistryBase *op_registry); -extern void Register_Concat(OperatorRegistryBase *op_registry); -extern void Register_Conv2D(OperatorRegistryBase *op_registry); -extern void Register_Crop(OperatorRegistryBase *op_registry); -extern void Register_Deconv2D(OperatorRegistryBase *op_registry); -extern void Register_DepthToSpace(OperatorRegistryBase *op_registry); -extern void Register_DepthwiseConv2d(OperatorRegistryBase *op_registry); -extern void Register_Dequantize(OperatorRegistryBase *op_registry); -extern void Register_Eltwise(OperatorRegistryBase *op_registry); -extern void Register_ExpandDims(OperatorRegistryBase *op_registry); -extern void Register_Fill(OperatorRegistryBase *op_registry); -extern void Register_FoldedBatchNorm(OperatorRegistryBase *op_registry); -extern void Register_FullyConnected(OperatorRegistryBase *op_registry); -extern void Register_Gather(OperatorRegistryBase *op_registry); -extern void Register_Identity(OperatorRegistryBase *op_registry); -extern void Register_InferConv2dShape(OperatorRegistryBase *op_registry); -extern void Register_LocalResponseNorm(OperatorRegistryBase *op_registry); -extern void Register_MatMul(OperatorRegistryBase *op_registry); -extern void Register_Pad(OperatorRegistryBase *op_registry); -extern void Register_Pooling(OperatorRegistryBase *op_registry); -extern void Register_Proposal(OperatorRegistryBase 
*op_registry); -extern void Register_Quantize(OperatorRegistryBase *op_registry); -extern void Register_ReduceMean(OperatorRegistryBase *op_registry); -extern void Register_Reshape(OperatorRegistryBase *op_registry); -extern void Register_ResizeBicubic(OperatorRegistryBase *op_registry); -extern void Register_ResizeBilinear(OperatorRegistryBase *op_registry); -extern void Register_Reverse(OperatorRegistryBase *op_registry); -extern void Register_ScalarMath(OperatorRegistryBase *op_registry); -extern void Register_Shape(OperatorRegistryBase *op_registry); -extern void Register_Softmax(OperatorRegistryBase *op_registry); -extern void Register_SpaceToBatchND(OperatorRegistryBase *op_registry); -extern void Register_SpaceToDepth(OperatorRegistryBase *op_registry); -extern void Register_Split(OperatorRegistryBase *op_registry); -extern void Register_SqrDiffMean(OperatorRegistryBase *op_registry); -extern void Register_Squeeze(OperatorRegistryBase *op_registry); -extern void Register_Stack(OperatorRegistryBase *op_registry); -extern void Register_StridedSlice(OperatorRegistryBase *op_registry); -extern void Register_Transpose(OperatorRegistryBase *op_registry); -extern void Register_Unstack(OperatorRegistryBase *op_registry); -extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry); // NOLINT(whitespace/line_length) -extern void Register_WinogradTransform(OperatorRegistryBase *op_registry); - -#ifdef MACE_ENABLE_OPENCL -extern void Register_BufferTransform(OperatorRegistryBase *op_registry); -extern void Register_BufferInverseTransform(OperatorRegistryBase *op_registry); -extern void Register_LSTMCell(OperatorRegistryBase *op_registry); -#endif // MACE_ENABLE_OPENCL -} // namespace ops - - -OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { - // Keep in lexicographical order - ops::Register_Activation(this); - ops::Register_AddN(this); - ops::Register_ArgMax(this); - ops::Register_BatchNorm(this); - 
ops::Register_BatchToSpaceND(this); - ops::Register_BiasAdd(this); - ops::Register_Cast(this); - ops::Register_ChannelShuffle(this); - ops::Register_Concat(this); - ops::Register_Conv2D(this); - ops::Register_Crop(this); - ops::Register_Deconv2D(this); - ops::Register_DepthToSpace(this); - ops::Register_DepthwiseConv2d(this); - ops::Register_Dequantize(this); - ops::Register_Eltwise(this); - ops::Register_ExpandDims(this); - ops::Register_Fill(this); - ops::Register_FoldedBatchNorm(this); - ops::Register_FullyConnected(this); - ops::Register_Gather(this); - ops::Register_Identity(this); - ops::Register_InferConv2dShape(this); - ops::Register_LocalResponseNorm(this); - ops::Register_MatMul(this); - ops::Register_Pad(this); - ops::Register_Pooling(this); - ops::Register_Proposal(this); - ops::Register_Quantize(this); - ops::Register_ReduceMean(this); - ops::Register_Reshape(this); - ops::Register_ResizeBicubic(this); - ops::Register_ResizeBilinear(this); - ops::Register_Reverse(this); - ops::Register_ScalarMath(this); - ops::Register_Shape(this); - ops::Register_Softmax(this); - ops::Register_SpaceToBatchND(this); - ops::Register_SpaceToDepth(this); - ops::Register_Split(this); - ops::Register_Stack(this); - ops::Register_StridedSlice(this); - ops::Register_SqrDiffMean(this); - ops::Register_Squeeze(this); - ops::Register_Transpose(this); - ops::Register_Unstack(this); - ops::Register_WinogradInverseTransform(this); - ops::Register_WinogradTransform(this); - -#ifdef MACE_ENABLE_OPENCL - ops::Register_BufferTransform(this); - ops::Register_BufferInverseTransform(this); - ops::Register_LSTMCell(this); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index a3b8c4d9168168eed9d7c44edaf22b4081ab1fd6..4823bd80a9078c3d9b220b5ce92a262542ee9ec5 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -32,7 +32,9 @@ #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include 
"mace/kernels/opencl/common.h" -#include "mace/ops/ops_register.h" +#include "mace/kernels/ops_register.h" +#include "mace/ops/ops_def_register.h" +#include "mace/public/mace.h" #include "mace/utils/utils.h" #include "mace/utils/quantize.h" @@ -139,8 +141,8 @@ class OpTestContext { class OpsTestNet { public: OpsTestNet() : - op_registry_(new OperatorRegistry()) { - } + op_def_registry_(new OpDefRegistry()), + op_registry_(new OpRegistry()) {} template void AddInputFromArray(const std::string &name, @@ -453,16 +455,24 @@ class OpsTestNet { NetDef net_def; for (auto &op_def_ : op_defs_) { net_def.add_op()->CopyFrom(op_def_); + net_def.add_op_types(op_def_.type()); } - net_ = CreateNet(op_registry_, net_def, &ws_, - OpTestContext::Get()->GetDevice(device)); + net_ = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device))); + MaceStatus status = net_->Init(); device_type_ = device; - return net_ != nullptr; + return status == MaceStatus::MACE_SUCCESS; } MaceStatus Run() { MACE_CHECK_NOTNULL(net_); - return net_->Run(); + MACE_RETURN_IF_ERROR(net_->Run()); + Sync(); + return MaceStatus::MACE_SUCCESS; } // DEPRECATED(liyin): @@ -477,7 +487,7 @@ class OpsTestNet { Setup(device); MACE_RETURN_IF_ERROR(Run()); } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } else { Setup(device); return Run(); @@ -491,14 +501,22 @@ class OpsTestNet { MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { device_type_ = device; - net_ = CreateNet(op_registry_, - net_def, - &ws_, - OpTestContext::Get()->GetDevice(device), - NetMode::INIT); - MACE_RETURN_IF_ERROR(net_->Run()); - net_ = CreateNet(op_registry_, net_def, &ws_, - OpTestContext::Get()->GetDevice(device)); + auto net = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + NetMode::INIT)); + MACE_RETURN_IF_ERROR(net->Init()); + 
MACE_RETURN_IF_ERROR(net->Run()); + net_ = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device))); + MACE_RETURN_IF_ERROR(net_->Init()); return net_->Run(); } @@ -520,7 +538,8 @@ class OpsTestNet { } public: - std::shared_ptr op_registry_; + std::shared_ptr op_def_registry_; + std::shared_ptr op_registry_; Workspace ws_; std::vector op_defs_; std::unique_ptr net_; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc deleted file mode 100644 index e6d468b22cf1fa642d06b627e57d4c5f8f7e727a..0000000000000000000000000000000000000000 --- a/mace/ops/pad.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/pad.h" - -namespace mace { -namespace ops { - -void Register_Pad(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PadOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/pad.h b/mace/ops/pad.h deleted file mode 100644 index 6a7ce1027946497cb287618a9320b33887aafcdd..0000000000000000000000000000000000000000 --- a/mace/ops/pad.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_PAD_H_ -#define MACE_OPS_PAD_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/pad.h" - -namespace mace { -namespace ops { - -template -class PadOp : public Operator { - public: - PadOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("paddings"), - OperatorBase::GetOptionalArg("constant_value", 0.0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(0); - Tensor *output_tensor = this->Output(0); - return functor_(input_tensor, output_tensor, future); - } - - private: - kernels::PadFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_PAD_H_ diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index c5172f8ca9600ea9225fb3929078df5d0ee3d7a4..ad8a12546e426e8e27c60479aed15e94511a52d4 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 3a68248eb5dfc157b3c3111e910b2928fb9b6369..a8c2267fac29069dcc2cdd4589a9b1288d3284ae 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc deleted file mode 100644 index b16fd2612dc64b4ef393badcefc05c806c855b74..0000000000000000000000000000000000000000 --- a/mace/ops/pooling.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/pooling.h" - -namespace mace { -namespace ops { - -void Register_Pooling(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PoolingOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PoolingOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h deleted file mode 100644 index 3d1753b399489766da17a2245ef2dc4f92f8683d..0000000000000000000000000000000000000000 --- a/mace/ops/pooling.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 
2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_POOLING_H_ -#define MACE_OPS_POOLING_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/pooling.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class PoolingOp : public ConvPool2dOpBase { - public: - PoolingOp(const OperatorDef &op_def, OpKernelContext *context) - : ConvPool2dOpBase(op_def, context), - kernels_(OperatorBase::GetRepeatedArgs("kernels")), - pooling_type_( - static_cast(OperatorBase::GetOptionalArg( - "pooling_type", static_cast(AVG)))), - functor_(context, - pooling_type_, - kernels_.data(), - this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data()) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - return functor_(input, output, future); - }; - - protected: - std::vector kernels_; - PoolingType pooling_type_; - kernels::PoolingFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_POOLING_H_ diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 36b9d607acaf7d1da094e30055fc81963e4f9018..e5199001387d900fd789fa140f0135c973697a67 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -12,10 +12,10 @@ // 
See the License for the specific language governing permissions and // limitations under the License. -#include "mace/kernels/pooling.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/pooling.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index c22e9b133b3c57a023be8cedfc318e7404dd6155..4cd432d54fa634b9f36333a64466cd45f7730c3e 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -14,11 +14,10 @@ #include "gtest/gtest.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/pooling.h" -#include "mace/ops/conv_pool_2d_base.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" -#include "mace/kernels/quantize.h" namespace mace { namespace ops { diff --git a/mace/ops/proposal.cc b/mace/ops/proposal.cc deleted file mode 100644 index 2b75eeafe777aa887602bbedb879185335ef3fa9..0000000000000000000000000000000000000000 --- a/mace/ops/proposal.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/proposal.h" - -namespace mace { -namespace ops { - -void Register_Proposal(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ProposalOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h deleted file mode 100644 index d879e240ca200d5fbd09212a7e0ecde68314c47e..0000000000000000000000000000000000000000 --- a/mace/ops/proposal.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_PROPOSAL_H_ -#define MACE_OPS_PROPOSAL_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/proposal.h" - -namespace mace { -namespace ops { - -template -class ProposalOp : public Operator { - public: - ProposalOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("min_size", 16), - OperatorBase::GetOptionalArg("nms_thresh", 0.7), - OperatorBase::GetOptionalArg("pre_nms_top_n", 6000), - OperatorBase::GetOptionalArg("post_nms_top_n", 300), - OperatorBase::GetOptionalArg("feat_stride", 0), - OperatorBase::GetOptionalArg("base_size", 12), - OperatorBase::GetRepeatedArgs("scales"), - OperatorBase::GetRepeatedArgs("ratios")) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *rpn_cls_prob = this->Input(RPN_CLS_PROB); - const Tensor *rpn_bbox_pred = this->Input(RPN_BBOX_PRED); - const Tensor *img_info = this->Input(IMG_INFO); - - Tensor *output = this->Output(ROIS); - - return functor_(rpn_cls_prob, rpn_bbox_pred, img_info, output, future); - } - - private: - kernels::ProposalFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); - MACE_OP_OUTPUT_TAGS(ROIS); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_PROPOSAL_H_ diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc deleted file mode 100644 index e8b2ae5aad79dbab8f08e89006a7e38ff40360d0..0000000000000000000000000000000000000000 --- a/mace/ops/proposal_test.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/operator.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -class ProposalOpTest : public OpsTestBase {}; - -TEST_F(ProposalOpTest, CPUSimple) { - const int img_height = 256; - const int img_width = 256; - const int height = 3; - const int width = 4; - - OpsTestNet net; - - OpDefBuilder("Proposal", "ProposalTest") - .Input("RpnCLSProb") - .Input("RpnBBoxPred") - .Input("ImgInfo") - .AddIntArg("min_size", 16) - .AddFloatArg("nms_thresh", 0.7) - .AddIntArg("pre_nms_top_n", 12000) - .AddIntArg("post_nms_top_n", 2000) - .AddIntArg("feat_stride", 16) - .AddIntArg("base_size", 16) - .AddIntsArg("scales", {8, 16, 32}) - .AddFloatsArg("ratios", {0.5, 1, 2}) - .Output("Output") - .Finalize(net.NewOperatorDef()); - - std::vector scores(height * width * 18); - for (size_t i = 0; i < scores.size(); ++i) { - scores[i] = i; - } - - // Add input data - net.AddInputFromArray("RpnCLSProb", - {1, height, width, 18}, scores); - net.AddRepeatedInput("RpnBBoxPred", - {1, height, width, 4 * 9}, 1); - net.AddInputFromArray("ImgInfo", {1, 1, 1, 3}, - {img_height, img_width, 2}); - - // Run - net.RunOp(); - - auto expected_tensor = net.CreateTensor({1, 1, 1, 5}, - {0, 0, 0, 255, 255}); - - ExpectTensorNear(*expected_tensor, *net.GetTensor("Output"), 1e-5); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/quantize.cc b/mace/ops/quantize.cc deleted file mode 100644 index 35f61ac90a567b262f27a325fe1220b4643e4545..0000000000000000000000000000000000000000 --- 
a/mace/ops/quantize.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/quantize.h" - -namespace mace { -namespace ops { - -void Register_Quantize(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - QuantizeOp); -} - -void Register_Dequantize(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DequantizeOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h deleted file mode 100644 index 2e7a77c2c624e5cc551898bc0b6d971eba580b1a..0000000000000000000000000000000000000000 --- a/mace/ops/quantize.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_QUANTIZE_H_ -#define MACE_OPS_QUANTIZE_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/quantize.h" - -namespace mace { -namespace ops { - -template -class QuantizeOp : public Operator { - public: - QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context), - non_zero_( - static_cast(OperatorBase::GetOptionalArg("non_zero", - 0))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, non_zero_, output, future); - } - - private: - kernels::QuantizeFunctor functor_; - bool non_zero_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -template -class DequantizeOp : public Operator { - public: - DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, output, future); - } - - private: - kernels::DequantizeFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_QUANTIZE_H_ diff --git a/mace/ops/quantize_test.cc b/mace/ops/quantize_test.cc index 5f9fd0d8f271370f6e172d38d1d68e66a12ac4f0..207ab4e4bfe2e6ae9eea11a63948d709f2c4378e 100644 --- a/mace/ops/quantize_test.cc +++ b/mace/ops/quantize_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc deleted file mode 100644 index ee4d171681ba56f2bfff5490d5742c8aeec9c70c..0000000000000000000000000000000000000000 --- a/mace/ops/reduce_mean.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/reduce_mean.h" - -namespace mace { -namespace ops { - -void Register_ReduceMean(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReduceMean") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReduceMeanOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReduceMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReduceMeanOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReduceMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReduceMeanOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reduce_mean.h b/mace/ops/reduce_mean.h deleted file mode 100644 index 0ef9c10274abbb28b6fb86bba2591e28ab0e38d2..0000000000000000000000000000000000000000 --- a/mace/ops/reduce_mean.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_REDUCE_MEAN_H_ -#define MACE_OPS_REDUCE_MEAN_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/reduce_mean.h" - -namespace mace { -namespace ops { - -template -class ReduceMeanOp : public Operator { - public: - ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("axis"), - OperatorBase::GetOptionalArg("keepdims", false)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const std::vector axis = - OperatorBase::GetRepeatedArgs("axis"); - const int left = static_cast(input->dim_size() * -1); - const int right = static_cast(input->dim_size()); - if (axis.size()) { - for (unsigned int i = 0; i < axis.size(); ++i) { - MACE_CHECK(axis[i] > left && axis[i] < right, "Axis is over range."); - } - } - Tensor *output = this->Output(OUTPUT); - - return functor_(input, output, future); - } - - private: - kernels::ReduceMeanFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REDUCE_MEAN_H_ diff --git a/mace/ops/reduce_mean_benchmark.cc b/mace/ops/reduce_mean_benchmark.cc index 3591c9b1e7c260f121cb855fee64f5f6b46515fc..02f6d44715004c7e43fb1405b98b1514fce8708c 100644 --- a/mace/ops/reduce_mean_benchmark.cc 
+++ b/mace/ops/reduce_mean_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index b1bbe5cc61455c54063d8ba7ca27b68195be00fe..24ff7a4ab3fa22ea80cfe5a8eb95efcadbb0091f 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc deleted file mode 100644 index 2831aeba12d1632e1e23773b4ccbba0fa2cee9e6..0000000000000000000000000000000000000000 --- a/mace/ops/reshape.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/reshape.h" - -namespace mace { -namespace ops { - -void Register_Reshape(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); - - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reshape_test.cc b/mace/ops/reshape_test.cc index 947e968b9dac4d7f163a635a56da14b619f883ce..bdc7ab9742a3becc365e2ee27ad1bb4edf78a089 100644 --- a/mace/ops/reshape_test.cc +++ b/mace/ops/reshape_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc deleted file mode 100644 index 7a50522f54ed1005bb8c12c138a7158f4034e496..0000000000000000000000000000000000000000 --- a/mace/ops/resize_bicubic.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/resize_bicubic.h" - -namespace mace { -namespace ops { - -void Register_ResizeBicubic(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBicubic") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBicubicOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBicubic") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBicubicOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBicubic") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBicubicOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/resize_bicubic.h b/mace/ops/resize_bicubic.h deleted file mode 100644 index df9fc11c269ad6eed0dedc9e99b9bf4c98af3ebe..0000000000000000000000000000000000000000 --- a/mace/ops/resize_bicubic.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_RESIZE_BICUBIC_H_ -#define MACE_OPS_RESIZE_BICUBIC_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/resize_bicubic.h" - -namespace mace { -namespace ops { - -template -class ResizeBicubicOp : public Operator { - public: - ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("align_corners", false), - OperatorBase::GetRepeatedArgs("size", {-1, -1})) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", - input->dim_size()); - - return functor_(input, output, future); - } - - private: - kernels::ResizeBicubicFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_RESIZE_BICUBIC_H_ - diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index ba22f4fecdf49267f9f845a0879fe1f38e7faa0f..f0847e4cf48adf911f35254201af67adb871545f 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 97da04804395fdbe13e1fef70ca619ce4f06c771..8dc1dbf7811983a5b82bc62b6e6639c47fbea5ee 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -14,9 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/resize_bicubic.h" namespace mace { namespace ops { diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc deleted file mode 100644 index 3106256955383366225fd24b97bbff8b49e9132d..0000000000000000000000000000000000000000 --- a/mace/ops/resize_bilinear.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/resize_bilinear.h" - -namespace mace { -namespace ops { - -void Register_ResizeBilinear(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h deleted file mode 100644 index f328a9a45e152b162ea0b7e978d078b0d5dbac29..0000000000000000000000000000000000000000 --- a/mace/ops/resize_bilinear.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_RESIZE_BILINEAR_H_ -#define MACE_OPS_RESIZE_BILINEAR_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/resize_bilinear.h" - -namespace mace { -namespace ops { - -template -class ResizeBilinearOp : public Operator { - public: - ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("size", {-1, -1}), - OperatorBase::GetOptionalArg("align_corners", false)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", - input->dim_size()); - - return functor_(input, output, future); - } - - private: - kernels::ResizeBilinearFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_RESIZE_BILINEAR_H_ diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 993d7269a44f44de337dd4403f31e085ca557f3e..2fd6b6c223095d3968e9705a1d0a830fab43ea94 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index c628bd9b1649fc6d6abb440dd70e294921131f62..3ff5372ad32685faf2c2988fc0b2d929f83a7523 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -14,9 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/resize_bilinear.h" namespace mace { namespace ops { diff --git a/mace/ops/reverse.cc b/mace/ops/reverse.cc deleted file mode 100644 index 4660fba7dd9f73de277aa0893585c639f85578de..0000000000000000000000000000000000000000 --- a/mace/ops/reverse.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/reverse.h" - -namespace mace { -namespace ops { - -void Register_Reverse(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reverse") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReverseOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reverse.h b/mace/ops/reverse.h deleted file mode 100644 index a753a4e26f7c5ce06ff96c5241e7b9a493f8f185..0000000000000000000000000000000000000000 --- a/mace/ops/reverse.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_REVERSE_H_ -#define MACE_OPS_REVERSE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/reverse.h" - -namespace mace { -namespace ops { - -template -class ReverseOp : public Operator { - public: - ReverseOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *axis = this->Input(AXIS); - Tensor *output = this->Output(OUTPUT); - return functor_(input, axis, output, future); - } - - private: - kernels::ReverseFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, AXIS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REVERSE_H_ diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc index c6352fab0680c81b33ca463ff711b1e70a65851b..40f2f908cb27011bd77672389d93f1c04246273e 100644 --- a/mace/ops/reverse_benchmark.cc +++ b/mace/ops/reverse_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/reverse_test.cc b/mace/ops/reverse_test.cc index afa17e502e9800556eacc7eebd0700c7e429a58f..282214fdcdfffddce5544e62a8ed8d81085f8471 100644 --- a/mace/ops/reverse_test.cc +++ b/mace/ops/reverse_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc deleted file mode 100644 index 82ef3eb3b3205adb45fb89f5d30c77af47355921..0000000000000000000000000000000000000000 --- a/mace/ops/scalar_math.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/scalar_math.h" - -namespace mace { -namespace ops { - -void Register_ScalarMath(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/scalar_math.h b/mace/ops/scalar_math.h deleted file mode 100644 index 356c93719894353a35459371b9f04d5f821a540a..0000000000000000000000000000000000000000 --- a/mace/ops/scalar_math.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 
Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_SCALAR_MATH_H_ -#define MACE_OPS_SCALAR_MATH_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/scalar_math.h" - -namespace mace { -namespace ops { - -template -class ScalarMathOp : public Operator { - public: - ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - static_cast( - OperatorBase::GetOptionalArg( - "type", static_cast(kernels::EltwiseType::NONE))), - OperatorBase::GetRepeatedArgs("coeff"), - OperatorBase::GetOptionalArg("scalar_input", 1.0), - OperatorBase::GetOptionalArg( - "scalar_input_index", 1)) {} - - MaceStatus Run(StatsFuture *future) override { - const std::vector input_list = this->Inputs(); - Tensor *output = this->Output(0); - return functor_(input_list, output, future); - } - - private: - kernels::ScalarMathFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SCALAR_MATH_H_ diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc index 0d34b80abb16cf4e7f6126f2d74e9c5ce8770fe0..99caa07d1a57dde8a4820b39e81e4250ef3faf99 100644 --- a/mace/ops/scalar_math_test.cc +++ b/mace/ops/scalar_math_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" #include "mace/kernels/eltwise.h" diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc deleted file mode 100644 index 6815496fe3bd5a801b4881922e38dc515b7a877c..0000000000000000000000000000000000000000 --- a/mace/ops/shape.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/shape.h" - -namespace mace { -namespace ops { - -void Register_Shape(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ShapeOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ShapeOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ShapeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/shape_test.cc b/mace/ops/shape_test.cc index 08ccb88b86958bb4fdbd3a1677fe1b728355f5fe..2b66c7ebff53b49260c9aeb10f61dc9a3b76e7dd 100644 --- a/mace/ops/shape_test.cc +++ b/mace/ops/shape_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc deleted file mode 100644 index 64586329b7a9817d4d85b63a722d305c2e5f0f17..0000000000000000000000000000000000000000 --- a/mace/ops/softmax.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/softmax.h" - -namespace mace { -namespace ops { - -void Register_Softmax(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h deleted file mode 100644 index 047402f0c0c5bf45f25ff58405359013e6ce0fa4..0000000000000000000000000000000000000000 --- a/mace/ops/softmax.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 
2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_SOFTMAX_H_ -#define MACE_OPS_SOFTMAX_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/softmax.h" - -namespace mace { -namespace ops { - -template -class SoftmaxOp : public Operator { - public: - SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *logits = this->Input(LOGITS); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(logits)); - - return functor_(logits, output, future); - } - - private: - kernels::SoftmaxFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(LOGITS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SOFTMAX_H_ diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 009d1aaeca7f5e9945d4292d593e641556e08b34..482709ade8e8ddc6e495916291e582c6344c3d61 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 012424c5b5d3deeed00fc73beb05b02063cd3374..98b0ad9783cc2e4670ff9467e430e0b502d8504e 100644 --- 
a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc deleted file mode 100644 index 29dbed979a842bb7601b83d5d0c6c27610bc988f..0000000000000000000000000000000000000000 --- a/mace/ops/space_to_batch.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/space_to_batch.h" - -namespace mace { -namespace ops { - -void Register_SpaceToBatchND(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h deleted file mode 100644 index fabd7bb235197f9b0101b46e77709b2d86977346..0000000000000000000000000000000000000000 --- a/mace/ops/space_to_batch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SPACE_TO_BATCH_H_ -#define MACE_OPS_SPACE_TO_BATCH_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/space_to_batch.h" - -namespace mace { -namespace ops { - -template -class SpaceToBatchNDOp : public Operator { - public: - SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), - OperatorBase::GetRepeatedArgs("block_shape", {1, 1})) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *space_tensor = this->Input(INPUT); - Tensor *batch_tensor = this->Output(OUTPUT); - return functor_(space_tensor, batch_tensor, future); - } - - private: - kernels::SpaceToBatchFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SPACE_TO_BATCH_H_ diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index faff487a710752c501150588740ece24e7b4e45a..565ad5dcccbfc9e586706867a9efded57285a7cd 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc deleted file mode 100644 index 67b520f6487f2115771fe6e0d05c7576febb4fd8..0000000000000000000000000000000000000000 --- a/mace/ops/space_to_depth.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/space_to_depth.h" - -namespace mace { -namespace ops { - -void Register_SpaceToDepth(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToDepthOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToDepthOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToDepthOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h deleted file mode 100644 index 6d078e2ff8ad0f4568470f3a143a4aefae37f16b..0000000000000000000000000000000000000000 --- a/mace/ops/space_to_depth.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SPACE_TO_DEPTH_H_ -#define MACE_OPS_SPACE_TO_DEPTH_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/space_to_depth.h" - -namespace mace { -namespace ops { - -template -class SpaceToDepthOp : public Operator { - public: - SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("block_size", 1)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - return functor_(input, output, future); - } - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::SpaceToDepthOpFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SPACE_TO_DEPTH_H_ diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index 97d3cb033ef6a54222a38e16194009ad97b09eae..480a042159684a69db3af576574852a442f8a379 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index c1168a650ffdda94502272dec5b56594bc11dc4f..765694924c9151e824098cc603f1cb5903af89f8 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -15,7 +15,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/split.cc b/mace/ops/split.cc deleted file mode 100644 index e5e103d7b1dfedb0e3ef26b9f2fbe0e84525ee6f..0000000000000000000000000000000000000000 --- a/mace/ops/split.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/split.h" - -namespace mace { -namespace ops { - -void Register_Split(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Split") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SplitOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Split") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SplitOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Split") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SplitOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/split.h b/mace/ops/split.h deleted file mode 100644 index aa41aa15c6bb6a2f181d514b916859c252aeffb1..0000000000000000000000000000000000000000 --- a/mace/ops/split.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SPLIT_H_ -#define MACE_OPS_SPLIT_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/split.h" - -namespace mace { -namespace ops { - -template -class SplitOp : public Operator { - public: - SplitOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->OutputSize() >= 2) - << "There must be at least two outputs for slicing"; - const Tensor *input = this->Input(INPUT); - const std::vector output_list = this->Outputs(); - const int32_t split_axis = OperatorBase::GetOptionalArg("axis", 3); - MACE_CHECK((input->dim(split_axis) % this->OutputSize()) == 0) - << "Outputs do not split input equally."; - - return functor_(input, output_list, future); - } - - private: - kernels::SplitFunctor functor_; - - private: - MACE_OP_INPUT_TAGS(INPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SPLIT_H_ diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 8dea1263c8f1761b33b4dd63be7ef25915e4157b..aa0e8fba507dfa4d294ad057e33dff41bdc82570 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/split_test.cc b/mace/ops/split_test.cc index 57544d18c62741434ea1162e179c3ff1856ab43a..d42b3716beb9ed214345ffc1667b6a7376195833 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -17,7 +17,6 @@ #include "gmock/gmock.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/split.h" namespace mace { namespace ops { diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc deleted file mode 100644 index d8e8bd515900d08bb12a7f1e242ab6d09babfba5..0000000000000000000000000000000000000000 --- a/mace/ops/sqrdiff_mean.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/sqrdiff_mean.h" - -namespace mace { -namespace ops { - -void Register_SqrDiffMean(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SqrDiffMean") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SqrDiffMeanOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SqrDiffMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqrDiffMeanOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SqrDiffMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqrDiffMeanOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/sqrdiff_mean.h b/mace/ops/sqrdiff_mean.h deleted file mode 100644 index f021c0b23de6300c56c86899206c601dd89534aa..0000000000000000000000000000000000000000 --- a/mace/ops/sqrdiff_mean.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SQRDIFF_MEAN_H_ -#define MACE_OPS_SQRDIFF_MEAN_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/sqrdiff_mean.h" - -namespace mace { -namespace ops { - -template -class SqrDiffMeanOp : public Operator { - public: - SqrDiffMeanOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input0 = this->Input(INPUT0); - const Tensor *input1 = this->Input(INPUT1); - Tensor *output = this->Output(OUTPUT); - - return functor_(input0, input1, output, future); - } - - private: - kernels::SqrDiffMeanFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT0, INPUT1); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SQRDIFF_MEAN_H_ diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc deleted file mode 100644 index eac886dd82b1c7ae515cf44e72d2bf1a1ce12508..0000000000000000000000000000000000000000 --- a/mace/ops/squeeze.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/squeeze.h" - -namespace mace { -namespace ops { - -void Register_Squeeze(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index fba5a37d245ea1c878753a96d39c2bf820af071e..166d98688438c6ea216b00e692178a27a2413315 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc deleted file mode 100644 index 7aa7c07eb407e35b36170c0b7784f001297415f1..0000000000000000000000000000000000000000 --- a/mace/ops/stack.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/stack.h" - -namespace mace { -namespace ops { - -void Register_Stack(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StackOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/stack.h b/mace/ops/stack.h deleted file mode 100644 index be25c0b079cf014eb171c2b4f311e038ac256892..0000000000000000000000000000000000000000 --- a/mace/ops/stack.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_STACK_H_ -#define MACE_OPS_STACK_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/stack.h" - -namespace mace { -namespace ops { - -template -class StackOp : public Operator { - public: - StackOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} - - MaceStatus Run(StatsFuture *future) override { - const std::vector &inputs = this->Inputs(); - Tensor *output = this->Output(OUTPUT); - return functor_(inputs, output, future); - } - - private: - kernels::StackFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_STACK_H_ diff --git a/mace/ops/stack_test.cc b/mace/ops/stack_test.cc index 8cccb133026026e3b24799fb95684956277849d9..e55ff278d27d3edc04af653e4568e106a4d8538e 100644 --- a/mace/ops/stack_test.cc +++ b/mace/ops/stack_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc deleted file mode 100644 index 0f608b1722fc60603f4e6f0e1d95d9f6e57e1e69..0000000000000000000000000000000000000000 --- a/mace/ops/strided_slice.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/strided_slice.h" - -namespace mace { -namespace ops { - -void Register_StridedSlice(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/strided_slice.h b/mace/ops/strided_slice.h deleted file mode 100644 index 249dc3e9d07b7b59665faedc10cb7c320f1c9aea..0000000000000000000000000000000000000000 --- a/mace/ops/strided_slice.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_STRIDED_SLICE_H_ -#define MACE_OPS_STRIDED_SLICE_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/strided_slice.h" - -namespace mace { -namespace ops { - -template -class StridedSliceOp : public Operator { - public: - StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("begin_mask", 0), - OperatorBase::GetOptionalArg("end_mask", 0), - OperatorBase::GetOptionalArg("ellipsis_mask", 0), - OperatorBase::GetOptionalArg("new_axis_mask", 0), - OperatorBase::GetOptionalArg("shrink_axis_mask", 0), - OperatorBase::GetOptionalArg("slice", - false)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *begin_indices = this->Input(BEGIN); - const Tensor *end_indices = this->Input(END); - const Tensor *strides = nullptr; - if (this->InputSize() > 3) { - strides = this->Input(STRIDES); - } - Tensor *output = this->Output(OUTPUT); - - return functor_(input, begin_indices, end_indices, strides, output, future); - } - - private: - kernels::StridedSliceFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_STRIDED_SLICE_H_ diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index d975d7beb922f40e648ffbdd009091537b5425c7..c13a813cc8099c8e5d9a7782971d112ddd2aea89 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc deleted file mode 100644 index 73dcaf7b650dbd168bd5c74a38c3a8fbdc3a7318..0000000000000000000000000000000000000000 --- a/mace/ops/transpose.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/transpose.h" - -namespace mace { -namespace ops { - -void Register_Transpose(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - TransposeOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h deleted file mode 100644 index 91aa3365a3606b3f8899e4ca07141fba7011fc7d..0000000000000000000000000000000000000000 --- a/mace/ops/transpose.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_TRANSPOSE_H_ -#define MACE_OPS_TRANSPOSE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/softmax.h" -#include "mace/kernels/transpose.h" - -namespace mace { - -template -class TransposeOp : public Operator { - public: - TransposeOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - dims_(OperatorBase::GetRepeatedArgs("dims")), - functor_(context, dims_) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - const std::vector &input_shape = input->shape(); - MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) || - (input_shape.size() == 2 && dims_.size() == 2), - "rank should be 2 or 4"); - std::vector output_shape; - for (size_t i = 0; i < dims_.size(); ++i) { - output_shape.push_back(input_shape[dims_[i]]); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - return functor_(input, output, future); - } - - protected: - std::vector dims_; - kernels::TransposeFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace mace - -#endif // MACE_OPS_TRANSPOSE_H_ diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc index 1e68a4a98b2a70084ec6f06511641fd20679add2..6d37b93c2de76f37e8c32fbffab68ec1112b398b 100644 --- a/mace/ops/transpose_benchmark.cc +++ b/mace/ops/transpose_benchmark.cc @@ -15,7 +15,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include 
"mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/transpose_test.cc b/mace/ops/transpose_test.cc index 76bfc57ad5b3ed5e431c3d39255caf81e39426c7..44ef0ec24a4e0703555cf0ec356138273c7f5e86 100644 --- a/mace/ops/transpose_test.cc +++ b/mace/ops/transpose_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/unstack.cc b/mace/ops/unstack.cc deleted file mode 100644 index 7b1c815bbc5eb29b98161b3096a88bfc2145ede7..0000000000000000000000000000000000000000 --- a/mace/ops/unstack.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/unstack.h" - -namespace mace { -namespace ops { - -void Register_Unstack(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Unstack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - UnstackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Unstack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - UnstackOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/unstack.h b/mace/ops/unstack.h deleted file mode 100644 index 1c3d1764972f6f8dc40e7353a2445e1e0ee6421d..0000000000000000000000000000000000000000 --- a/mace/ops/unstack.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_UNSTACK_H_ -#define MACE_OPS_UNSTACK_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/unstack.h" - -namespace mace { -namespace ops { - -template -class UnstackOp : public Operator { - public: - UnstackOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const std::vector outputs = this->Outputs(); - return functor_(input, outputs, future); - } - - private: - kernels::UnstackFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(INPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_UNSTACK_H_ diff --git a/mace/ops/unstack_test.cc b/mace/ops/unstack_test.cc index 306c836242426612763ea11cd573803c6d358021..4c9774ff2ac3e3e4bc5c3e47f65f05e30dd9d96b 100644 --- a/mace/ops/unstack_test.cc +++ b/mace/ops/unstack_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/winograd_convolution_benchmark.cc b/mace/ops/winograd_convolution_benchmark.cc index c616a28072adc2634bea33628be3a45c1ac5779a..3b126f07b39e25a7f9e372ece47cf16f701c6f72 100644 --- a/mace/ops/winograd_convolution_benchmark.cc +++ b/mace/ops/winograd_convolution_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 3cd5ab92b7a5aa0def56ed83bb58847042b2fc20..1c82a18988a21bb4911ee004a55342041f1e1d21 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -22,7 +22,7 @@ namespace mace { namespace ops { namespace test { -class WinogradConvlutionTest : public OpsTestBase {}; +class WinogradConvolutionTest : public OpsTestBase {}; namespace { @@ -134,42 +134,42 @@ void WinogradConvolution(const index_t batch, } } // namespace -TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM2) { WinogradConvolution(1, 32, 32, 3, 3, Padding::VALID, 2); WinogradConvolution(1, 32, 32, 3, 3, Padding::SAME, 2); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2) { WinogradConvolution(1, 61, 67, 31, 37, Padding::VALID, 2); WinogradConvolution(1, 61, 67, 37, 31, Padding::SAME, 2); } -TEST_F(WinogradConvlutionTest, BatchConvolutionM2) { +TEST_F(WinogradConvolutionTest, BatchConvolutionM2) { WinogradConvolution(3, 64, 64, 32, 32, Padding::VALID, 2); WinogradConvolution(5, 61, 67, 37, 31, Padding::SAME, 2); } -TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM4) { WinogradConvolution(1, 32, 32, 3, 3, Padding::VALID, 4); WinogradConvolution(1, 32, 32, 3, 3, Padding::SAME, 4); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) { WinogradConvolution(1, 61, 67, 
31, 37, Padding::VALID, 4); WinogradConvolution(1, 61, 67, 37, 31, Padding::SAME, 4); } -TEST_F(WinogradConvlutionTest, BatchConvolutionM4) { +TEST_F(WinogradConvolutionTest, BatchConvolutionM4) { WinogradConvolution(3, 64, 64, 32, 32, Padding::VALID, 4); WinogradConvolution(5, 61, 67, 37, 31, @@ -284,42 +284,42 @@ void WinogradConvolutionWithPad(const index_t batch, } } // namespace -TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM2WithPad) { WinogradConvolutionWithPad(1, 32, 32, 32, 16, 1, 2); WinogradConvolutionWithPad(1, 32, 32, 32, 16, 2, 2); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2WithPad) { WinogradConvolutionWithPad(1, 61, 67, 31, 37, 1, 2); WinogradConvolutionWithPad(1, 61, 67, 37, 31, 2, 2); } -TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) { +TEST_F(WinogradConvolutionTest, BatchConvolutionWithM2Pad) { WinogradConvolutionWithPad(3, 64, 64, 32, 32, 1, 2); WinogradConvolutionWithPad(5, 61, 67, 37, 31, 2, 2); } -TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM4WithPad) { WinogradConvolutionWithPad(1, 32, 32, 32, 16, 1, 4); WinogradConvolutionWithPad(1, 32, 32, 32, 16, 2, 4); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4WithPad) { WinogradConvolutionWithPad(1, 61, 67, 31, 37, 1, 4); WinogradConvolutionWithPad(1, 61, 67, 37, 31, 2, 4); } -TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) { +TEST_F(WinogradConvolutionTest, BatchConvolutionWithM4Pad) { WinogradConvolutionWithPad(3, 64, 64, 32, 32, 1, 4); WinogradConvolutionWithPad(5, 61, 67, 37, 31, diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc deleted file mode 100644 index 62e86248136c3cd4b8f94ee305c700dcaa16277e..0000000000000000000000000000000000000000 --- 
a/mace/ops/winograd_inverse_transform.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/winograd_inverse_transform.h" - -namespace mace { -namespace ops { - -void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry) { -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradInverseTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradInverseTransformOp); -#else - MACE_UNUSED(op_registry); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h deleted file mode 100644 index 548c889a2538b147eae895f24f7b844de5fc6e1c..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_inverse_transform.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ -#define MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ - -#include -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/winograd_transform.h" - -namespace mace { -namespace ops { - -template -class WinogradInverseTransformOp : public Operator { - public: - WinogradInverseTransformOp(const OperatorDef &op_def, - OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f), - OperatorBase::GetOptionalArg("wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const std::vector &inputs = this->Inputs(); - Tensor *output_tensor = this->Output(OUTPUT); - return functor_(inputs, output_tensor, future); - } - - private: - kernels::WinogradInverseTransformFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc deleted file mode 100644 index a4dab0ec1d1d1cacdd30c292c481834b86d35918..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_transform.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/winograd_transform.h" - -namespace mace { -namespace ops { - -void Register_WinogradTransform(OperatorRegistryBase *op_registry) { -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradTransformOp); -#else - MACE_UNUSED(op_registry); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h deleted file mode 100644 index 2274b6e8a8c29aa0a4d46cda6a344206055aa0fa..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_transform.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_WINOGRAD_TRANSFORM_H_ -#define MACE_OPS_WINOGRAD_TRANSFORM_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/winograd_transform.h" - -namespace mace { -namespace ops { - -template -class WinogradTransformOp : public Operator { - public: - WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - static_cast(OperatorBase::GetOptionalArg( - "padding", static_cast(VALID))), - OperatorBase::GetRepeatedArgs("padding_values"), - OperatorBase::GetOptionalArg( - "wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(INPUT); - Tensor *output_tensor = this->Output(OUTPUT); - - return functor_(input_tensor, output_tensor, future); - } - - private: - kernels::WinogradTransformFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc index 9955c9abc35f63c02f7b5461da2cbc425657265f..5c21c9ad90fe171b4b2ce1d703f3fbdf654f083a 100644 --- a/mace/ops/winograd_transform_benchmark.cc +++ b/mace/ops/winograd_transform_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 1609cd4d62cb780b61a46100cb9cbd9122722ddf..4b78900055d424321139867c3b3c22fde62d7ebe 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -74,10 +74,11 @@ message OperatorDef { repeated string output = 2; optional string name = 3; optional string type = 4; - repeated Argument arg = 5; - repeated OutputShape output_shape = 6; - repeated DataType output_type = 7; - repeated QuantizeActivationInfo quantize_info = 8; + optional int32 device_type = 5; + repeated Argument arg = 6; + repeated OutputShape output_shape = 7; + repeated DataType output_type = 8; + repeated QuantizeActivationInfo quantize_info = 9; repeated int32 mem_id = 10; @@ -121,6 +122,7 @@ message NetDef { repeated OperatorDef op = 1; repeated Argument arg = 2; repeated ConstTensor tensors = 3; + repeated string op_types = 4; // for mem optimization optional MemoryArena mem_arena = 10; diff --git a/mace/public/mace.h b/mace/public/mace.h index 313e1afb551fbffe6bc4c798296d89dcc08171fe..e9ab737ae5e3fa0582a12fb998f1000077121995 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -82,17 +82,42 @@ class RunMetadata { const char *MaceVersion(); -enum MaceStatus { - MACE_SUCCESS = 0, - MACE_INVALID_ARGS = 1, - MACE_OUT_OF_RESOURCES = 2 +class MaceStatus { + public: + enum Code { + MACE_SUCCESS = 0, + MACE_INVALID_ARGS = 1, + MACE_OUT_OF_RESOURCES = 2 + }; + + public: + MaceStatus(); + MaceStatus(const Code code); // NOLINT(runtime/explicit) + MaceStatus(const Code code, const std::string &information); + MaceStatus(const MaceStatus &); + MaceStatus(MaceStatus &&); + MaceStatus &operator=(const MaceStatus &); + MaceStatus &operator=(const MaceStatus &&); + ~MaceStatus(); + Code code() const; + std::string information() const; + + bool operator==(const MaceStatus &other) const; 
+ bool operator!=(const MaceStatus &other) const; + + private: + class Impl; + std::unique_ptr impl_; }; -#define MACE_RETURN_IF_ERROR(stmt) \ + +#define MACE_RETURN_IF_ERROR(stmt) \ { \ MaceStatus status = (stmt); \ - if (status != MACE_SUCCESS) { \ - VLOG(0) << "Mace runtime failure: " << __FILE__ << ":" << __LINE__; \ + if (status != MaceStatus::MACE_SUCCESS) { \ + VLOG(0) << "Mace runtime failure: " \ + << __FILE__ << ":" << __LINE__ << ". " \ + << status.information(); \ return status; \ } \ } @@ -112,9 +137,9 @@ class MACE_API GPUContextBuilder { GPUContextBuilder(); ~GPUContextBuilder(); GPUContextBuilder(const GPUContextBuilder &) = delete; - GPUContextBuilder(const GPUContextBuilder &&) = delete; + GPUContextBuilder(GPUContextBuilder &&) = delete; GPUContextBuilder &operator=(const GPUContextBuilder &) = delete; - GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete; + GPUContextBuilder &operator=(GPUContextBuilder &&) = delete; /// \brief Set internal storage factory to store internal data. /// @@ -167,7 +192,7 @@ class MACE_API MaceEngineConfig { /// /// Just use one GPUContext for multiple models run on GPU. /// \param context created use GPUContextBuilder - /// \return MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetGPUContext(std::shared_ptr context); /// \brief Set GPU hints, currently only supports Adreno GPU. @@ -177,7 +202,7 @@ class MACE_API MaceEngineConfig { /// /// \param perf_hint performance hint /// \param priority_hint priority hint - /// \return MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); @@ -199,7 +224,7 @@ class MACE_API MaceEngineConfig { /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's /// suggested to use AFFINITY_NONE to use all cores. 
/// \param use_gemmlowp use gemmlowp for quantized inference - /// \return MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetCPUThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy, bool use_gemmlowp = false); @@ -273,8 +298,9 @@ class MACE_API MaceEngine { /// \param output_nodes[in]: the array of output nodes' name /// \param config[in]: configurations for MaceEngine. /// \param engine[out]: output MaceEngine object -/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, -/// MACE_OUT_OF_RESOURCES for resources is out of range. +/// \return MaceStatus::MACE_SUCCESS for success, +/// MaceStatus::MACE_INVALID_ARGS for wrong arguments, +/// MaceStatus::MACE_OUT_OF_RESOURCES for resources is out of range. MACE_API MaceStatus CreateMaceEngineFromProto( const std::vector &model_pb, const std::string &model_data_file, diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index a89e3abdb1e4a75fdf3ee5489439cb7d89cbfcfd..92a6b12d2cdea5cc908d7427ca5e698a2a3f2b41 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -214,6 +214,9 @@ def main(unused_args): for arg in cpu_graph_def.arg: if arg.name not in output_graph_arg_names: output_graph_def.arg.extend(arg) + for op_type in cpu_graph_def.op_types: + if op_type not in output_graph_def.op_types: + output_graph_def.op_types.extend([op_type]) print("Merge done") else: option.device = device_type_map[FLAGS.runtime] diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index a0deec6334ad261857bd1af879ba47edd0d8a8c2..8ca694dc3999b509bfef4ec93457e2683f152d5a 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -94,7 +94,6 @@ MaceSupportedOps = [ 'Dequantize', 'Eltwise', 'ExpandDims', - 'FoldedBatchNorm', 'Fill', 'FullyConnected', 'Gather', 
diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index 5aabfa4b8bef5f3278ee1a1372a9ed3dc5b3dd6e..374d10737b6ea5bfc1a46902cc8442067f72703d 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -487,7 +487,7 @@ class CaffeConverter(base_converter.ConverterInterface): def convert_folded_batchnorm(self, caffe_op): op = self.convert_general_op(caffe_op) - op.type = MaceOp.FoldedBatchNorm.name + op.type = MaceOp.BatchNorm.name scale_op = None for consumer in self._caffe_net.get_consumers(caffe_op.layer.top[0]): diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py index 5320c804f18b8840c6e0e490a449ee2d616ad551..e62affaf9d9fe392fae2f3591cc910c05c0aeffa 100644 --- a/mace/python/tools/converter_tool/shape_inference.py +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -37,7 +37,7 @@ class ShapeInference(object): MaceOp.Deconv2D.name: self.infer_shape_deconv, MaceOp.DepthwiseConv2d.name: self.infer_shape_conv_pool_shape, MaceOp.Eltwise.name: self.infer_shape_general, - MaceOp.FoldedBatchNorm.name: self.infer_shape_general, + MaceOp.BatchNorm.name: self.infer_shape_general, MaceOp.AddN.name: self.infer_shape_general, MaceOp.Activation.name: self.infer_shape_general, MaceOp.Pooling.name: self.infer_shape_conv_pool_shape, diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 56f2c3a0ee5e845edbc514a5770e2a0a0a221ba0..68e5ccb537cea82497647afbfa1a528068f34244 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -531,7 +531,7 @@ class TensorflowConverter(base_converter.ConverterInterface): def convert_fused_batchnorm(self, tf_op): op = self.convert_general_op(tf_op) - op.type = MaceOp.FoldedBatchNorm.name + 
op.type = MaceOp.BatchNorm.name is_training = tf_op.get_attr(tf_is_training_str) assert is_training is False, 'Only support batch normalization ' \ diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 08f8fcaef9cac5aae821c3ac9af7b9ddb5f4148a..fe5c02ce940e723fe62818652dcddda5e64f7131 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -353,7 +353,7 @@ class Transformer(base_converter.ConverterInterface): and consumer_op.input[1] in self._consts \ and len(self._consts[consumer_op.input[1]].dims) == 1: print("Fold batchnorm: %s(%s)" % (op.name, op.type)) - consumer_op.type = MaceOp.FoldedBatchNorm.name + consumer_op.type = MaceOp.BatchNorm.name consumer_op.input[:] = [op.input[0], op.input[1], consumer_op.input[1]] @@ -544,7 +544,7 @@ class Transformer(base_converter.ConverterInterface): if (op.type == MaceOp.Conv2D.name) \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] - if consumer_op.type == MaceOp.FoldedBatchNorm.name: + if consumer_op.type == MaceOp.BatchNorm.name: print("Fold conv and bn: %s(%s)" % (op.name, op.type)) filter = self._consts[op.input[1]] scale = self._consts[consumer_op.input[1]] @@ -584,7 +584,7 @@ class Transformer(base_converter.ConverterInterface): if (op.type == MaceOp.Deconv2D.name) \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] - if consumer_op.type == MaceOp.FoldedBatchNorm.name: + if consumer_op.type == MaceOp.BatchNorm.name: print("Fold deconv and bn: %s(%s)" % (op.name, op.type)) filter = self._consts[op.input[1]] scale = self._consts[consumer_op.input[1]] @@ -627,7 +627,7 @@ class Transformer(base_converter.ConverterInterface): if op.type == MaceOp.DepthwiseConv2d.name \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] - if consumer_op.type == MaceOp.FoldedBatchNorm.name: + if 
consumer_op.type == MaceOp.BatchNorm.name: print("Fold depthwise conv and bn: %s(%s)" % (op.name, op.type)) filter = self._consts[op.input[1]] @@ -989,7 +989,7 @@ class Transformer(base_converter.ConverterInterface): or op.type == MaceOp.Deconv2D.name or op.type == MaceOp.DepthwiseConv2d.name or op.type == MaceOp.FullyConnected.name - or op.type == MaceOp.FoldedBatchNorm.name + or op.type == MaceOp.BatchNorm.name or op.type == MaceOp.WinogradInverseTransform.name) \ and len(self._consumers.get(op.output[0], [])) == 1: consumer_op = self._consumers[op.output[0]][0] @@ -1450,7 +1450,7 @@ class Transformer(base_converter.ConverterInterface): if op.input[1] in self._consts \ and len(self._consts[op.input[1]].dims) == 1: self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.FoldedBatchNorm.name: + elif op.type == MaceOp.BatchNorm.name: self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) if len(op.input) >= 4: @@ -1712,6 +1712,14 @@ class Transformer(base_converter.ConverterInterface): ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) + def add_op_types(self): + net = self._model + op_types = set() + for op in net.op: + op_types.add(op.type) + for op_type in op_types: + net.op_types.extend([op_type]) + def sort_by_execution(self): print("Sort by execution") net = self._model @@ -1728,6 +1736,8 @@ class Transformer(base_converter.ConverterInterface): del net.op[:] net.op.extend(sorted_nodes) + self.add_op_types() + print("Final ops:") for op in net.op: print("%s (%s): %s" % (op.name, op.type, [ diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2 index ab400151d4576aca85ece315020998027f20d62f..3e183f15d24e745fddf5f7a304cdbb125745955c 100644 --- a/mace/python/tools/mace_engine_factory.h.jinja2 +++ b/mace/python/tools/mace_engine_factory.h.jinja2 @@ -60,7 +60,7 @@ std::map model_name_map { /// \param output_nodes[in]: the 
array of output nodes' name /// \param config[in]: configurations for MaceEngine. /// \param engine[out]: output MaceEngine object -/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, +/// \return MaceStatus::MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// MACE_OUT_OF_RESOURCES for resources is out of range. MaceStatus CreateMaceEngineFromCode( const std::string &model_name, diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 3f4ba1c4f5d907352a0cee9bca719fa29be08768..ec1ba28451949e467eb755db4bee137e52658471 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -122,6 +122,12 @@ void CreateTensors(NetDef *net_def) { {% endfor %} } +void CreateOpTypes(NetDef *net_def) { + {% for op_type in net.op_types %} + net_def->add_op_types({{ op_type|tojson }}); + {% endfor %} +} + {% if net.mem_arena.mem_block|length != 0 %} void CreateMemoryArena(mace::MemoryArena *mem_arena) { mem_arena->mutable_mem_block()->Reserve({{ net.mem_arena.mem_block|length }}); @@ -162,6 +168,9 @@ const std::shared_ptr CreateNet() { {% if net.output_info | length > 0 %} CreateOutputInfo(net_def.get()); {% endif %} + {% if net.op_types|length > 0 %} + CreateOpTypes(net_def.get()); + {% endif %} return net_def; } diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 7f768adc7e2420c7163e862b6be244983b0a1791..0f8d1f492b6726e528900add6fafed661cb69df4 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -15,7 +15,7 @@ #include #include // NOLINT(build/c++11) -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -332,6 +332,10 @@ void MaceRunFunc(const int in_out_size) { OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + for (int i = 0; i < net_def->op_size(); ++i) { + 
net_def->add_op_types(net_def->op(i).type()); + } + MaceEngineConfig config(DeviceType::GPU); MaceEngine engine(config); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 945758b947c71baa1a6550dc1cf7a076ffebad65..54dd99b78db05106ca4d526c35731d7c5e3d6222 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -15,7 +15,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" #include "mace/public/mace.h" @@ -334,6 +334,10 @@ void MaceRun(const int in_out_size, info->set_name(output_names[i]); } + for (int i = 0; i < net_def->op_size(); ++i) { + net_def->add_op_types(net_def->op(i).type()); + } + MaceEngineConfig config(DeviceType::GPU); MaceEngine engine(config); diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 79b4c5710de6c4d47126cf6a8468ec071a80163a..08ffdebef220533e79ceb54fbc97b019aa8ae8ef 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -215,7 +215,7 @@ bool RunModel(const std::string &model_name, FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy), true); - if (status != MACE_SUCCESS) { + if (status != MaceStatus::MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } #ifdef MACE_ENABLE_OPENCL @@ -274,9 +274,9 @@ bool RunModel(const std::string &model_name, #endif int64_t t1 = NowMicros(); - if (create_engine_status != MACE_SUCCESS) { + if (create_engine_status != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Create engine runtime error, retry ... 
errcode: " - << create_engine_status; + << create_engine_status.information(); } else { init_millis = (t1 - t0) / 1000.0; LOG(INFO) << "Total init latency: " << init_millis << " ms"; @@ -324,9 +324,9 @@ bool RunModel(const std::string &model_name, while (true) { int64_t t3 = NowMicros(); MaceStatus warmup_status = engine->Run(inputs, &outputs); - if (warmup_status != MACE_SUCCESS) { + if (warmup_status != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Warmup runtime error, retry ... errcode: " - << warmup_status; + << warmup_status.information(); do { #ifdef MODEL_GRAPH_FORMAT_CODE create_engine_status = @@ -345,7 +345,7 @@ bool RunModel(const std::string &model_name, config, &engine); #endif - } while (create_engine_status != MACE_SUCCESS); + } while (create_engine_status != MaceStatus::MACE_SUCCESS); } else { int64_t t4 = NowMicros(); warmup_millis = (t4 - t3) / 1000.0; @@ -364,9 +364,9 @@ bool RunModel(const std::string &model_name, while (true) { int64_t t0 = NowMicros(); run_status = engine->Run(inputs, &outputs); - if (run_status != MACE_SUCCESS) { + if (run_status != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Mace run model runtime error, retry ... 
errcode: " - << run_status; + << run_status.information(); do { #ifdef MODEL_GRAPH_FORMAT_CODE create_engine_status = @@ -385,7 +385,7 @@ bool RunModel(const std::string &model_name, config, &engine); #endif - } while (create_engine_status != MACE_SUCCESS); + } while (create_engine_status != MaceStatus::MACE_SUCCESS); } else { int64_t t1 = NowMicros(); total_run_duration += (t1 - t0); diff --git a/mace/utils/BUILD b/mace/utils/BUILD index 283efa490e0a54ed48df9a61a289e5b67bf503f8..6d6feb1a52736368198b11700de9493f65af44a6 100644 --- a/mace/utils/BUILD +++ b/mace/utils/BUILD @@ -13,6 +13,7 @@ cc_library( name = "utils", srcs = [ "logging.cc", + "status.cc", "string_util.cc", ], hdrs = glob([ diff --git a/mace/utils/status.cc b/mace/utils/status.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd8dd9daecdf8e28bc8cb8d09732327b23f64e11 --- /dev/null +++ b/mace/utils/status.cc @@ -0,0 +1,88 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/public/mace.h" + +namespace mace { + +class MaceStatus::Impl { + public: + explicit Impl(const Code code): code_(code), information_("") {} + Impl(const Code code, const std::string &information) + : code_(code), information_(information) {} + ~Impl() = default; + + void SetCode(const Code code) { code_ = code; } + Code code() const { return code_; } + void SetInformation(const std::string &info) { information_ = info; } + std::string information() const { return Code2Str() + ": " + information_; } + + private: + std::string Code2Str() const { + switch (code_) { + case MaceStatus::MACE_SUCCESS: + return "Success"; + case MaceStatus::MACE_INVALID_ARGS: + return "Invalid Arguments"; + case MaceStatus::MACE_OUT_OF_RESOURCES: + return "Out of resources"; + default: + return ""; + } + } + + private: + MaceStatus::Code code_; + std::string information_; +}; + +MaceStatus::MaceStatus() + : impl_(new MaceStatus::Impl(MaceStatus::MACE_SUCCESS)) {} +MaceStatus::MaceStatus(const Code code) : impl_(new MaceStatus::Impl(code)) {} +MaceStatus::MaceStatus(const Code code, const std::string &information) + : impl_(new MaceStatus::Impl(code, information)) {} +MaceStatus::MaceStatus(const MaceStatus &other) + : impl_(new MaceStatus::Impl(other.code(), other.information())) {} +MaceStatus::MaceStatus(MaceStatus &&other) + : impl_(new MaceStatus::Impl(other.code(), other.information())) {} +MaceStatus::~MaceStatus() = default; + +MaceStatus& MaceStatus::operator=(const MaceStatus &other) { + impl_->SetCode(other.code()); + impl_->SetInformation(other.information()); + return *this; +} +MaceStatus& MaceStatus::operator=(const MaceStatus &&other) { + impl_->SetCode(other.code()); + impl_->SetInformation(other.information()); + return *this; +} + +MaceStatus::Code MaceStatus::code() const { + return impl_->code(); +} + +std::string MaceStatus::information() const { + return impl_->information(); +} + +bool MaceStatus::operator==(const MaceStatus &other) const { + return 
other.code() == impl_->code(); +} + +bool MaceStatus::operator!=(const MaceStatus &other) const { + return other.code() != impl_->code(); +} + +} // namespace mace diff --git a/mace/utils/utils.h b/mace/utils/utils.h index 12138cad8ecdc2fba4d9742faa8079c5519416bc..237febcce69f9d849ad3431c502295273bea89b3 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -33,8 +33,8 @@ namespace mace { CLASSNAME &operator=(const CLASSNAME &) = delete #endif -#ifndef MACE_VIRTUAL_EMPTY_DESTRUCTOR -#define MACE_VIRTUAL_EMPTY_DESTRUCTOR(CLASSNAME) \ +#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR +#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \ public: \ virtual ~CLASSNAME() {} #endif