diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index e0dac730639276dbd30bf210b466c57d9940feaf..98807b6789b07355da7fe02260b788f02f36b9fc 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -83,7 +83,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } } diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index 4f6045d8f75d20d48aa450f4c5266a7669a0620d..2cb1379b55f01a3c10dc8d9c83c72cc1e56051b7 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -96,6 +96,43 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) #undef MACE_GET_REPEATED_ARGUMENT_FUNC +#define MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname) \ + template<> \ + void SetProtoArg(Def *def, \ + const std::string &arg_name, \ + const T &value) { \ + int size = def->arg_size(); \ + for (int i = 0; i < size; ++i) { \ + auto arg = def->mutable_arg(i); \ + if (arg->name() == arg_name) { \ + VLOG(3) << "Update old argument value from " \ + << arg->fieldname() << " to " \ + << value << " for " << arg_name; \ + arg->set_##fieldname(value); \ + return; \ + } \ + } \ + VLOG(3) << "Add new argument " << arg_name << "(name: " \ + << arg_name << ", value: " << value << ")"; \ + auto arg = def->add_arg(); \ + arg->set_name(arg_name); \ + arg->set_##fieldname(value); \ + } + +#define MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(Def) \ + MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f) \ + MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i) \ + MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i) \ + MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i) + +MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef) +MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef) +#undef MACE_SET_OPTIONAL_ARGUMENT_FUNC + +const std::string OutputMemoryTypeTagName() { + static const char *kOutputMemTypeArgName = "output_mem_type"; + return kOutputMemTypeArgName; +} bool IsQuantizedModel(const NetDef &net_def) { return diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 9d2cd243f669c3fc03907225f7b0b42aa71326d2..e3a6319a18251f462d624f69f40f7f41f6e860ce 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -55,6 +55,18 @@ class ProtoArgHelper { std::map arg_map_; }; +template +void SetProtoArg(OperatorDef *op_def, + const std::string &arg_name, + const T&value); + +template +void SetProtoArg(NetDef *op_def, + const std::string &arg_name, + const T&value); + +const std::string OutputMemoryTypeTagName(); + bool IsQuantizedModel(const NetDef &def); } // namespace mace diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 7f86d0eb426d5c5834f9d498f9554c73a0602df0..b781682f618e79149b99dad5002ac68031989362 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -33,7 +33,7 @@ namespace mace { bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) { static const std::unordered_set kReuseOp = { - "Reshape", "Identity", "Squeeze" + "Reshape", "Identity", "Squeeze", "ExpandDims" }; return kReuseOp.count(op_type) == 1; } @@ -124,8 +124,10 @@ void MemoryOptimizer::Optimize( op_def->output_type_size()); DataType dt; - bool has_data_format = ProtoArgHelper::GetOptionalArg( - *op_def, "has_data_format", 0) != 0; + DataFormat data_format = static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "data_format", + static_cast(DataFormat::NONE))); int output_size = op_def->output_size(); for (int i = 0; i < output_size; ++i) { if (i < op_def->output_type_size()) { @@ -209,7 +211,7 @@ void MemoryOptimizer::Optimize( mem_ref_count_[best_mem_id] = 1; } tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id, - dt, has_data_format)); + dt, data_format)); } } diff --git a/mace/core/memory_optimizer.h b/mace/core/memory_optimizer.h index 986c5450280184990b426b18d99b886ee6f8fcac..b4e635f54f8c1e74328803793a58ff20ceeefbf0 100644 --- a/mace/core/memory_optimizer.h +++ b/mace/core/memory_optimizer.h @@ -22,6 +22,7 @@ #include #include "mace/proto/mace.pb.h" +#include "mace/port/port.h" #include "mace/core/types.h" namespace mace { @@ -81,10 +82,10 @@ class MemoryOptimizer { struct TensorMemInfo { int mem_id; DataType data_type; - bool has_data_format; + DataFormat data_format; - TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) : - mem_id(mem_id), data_type(data_type), has_data_format(has_data_format) + TensorMemInfo(int mem_id, DataType data_type, DataFormat data_format) : + mem_id(mem_id), data_type(data_type), data_format(data_format) {} }; diff --git a/mace/core/net.cc b/mace/core/net.cc index a10d96bb560b2a145146bcffa88e2b4e045f0e10..8c301dc728f0af53137023f4d019e9a89cf3e6ce 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -31,99 +31,8 @@ #include "mace/utils/memory.h" #include "mace/utils/timer.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_util.h" -#endif // MACE_ENABLE_OPENCL - namespace mace { -namespace { -struct InternalOutputInfo { - InternalOutputInfo(const MemoryType mem_type, - const DataType dtype, - const DataFormat data_format, - const std::vector &shape, - int op_idx) - : mem_type(mem_type), dtype(dtype), data_format(data_format), - shape(shape), op_idx(op_idx) {} - - MemoryType mem_type; // transformed memory type - DataType dtype; - DataFormat data_format; - std::vector shape; // tensor shape - int op_idx; // operation which generate the tensor -}; - -#ifdef MACE_ENABLE_OPENCL -std::string TransformedName(const std::string &input_name, - const mace::MemoryType mem_type) { - std::stringstream ss; - ss << input_name << "_mem_type_" << mem_type; - return ss.str(); -} - -bool TransformRequiredOp(const std::string &op_type) { - static const std::unordered_set kNoTransformOp = { - "Shape", "InferConv2dShape" - }; - return kNoTransformOp.count(op_type) == 0; -} -#endif // MACE_ENABLE_OPENCL - -} // namespace - -std::unique_ptr SerialNet::CreateOperation( - const OpRegistryBase *op_registry, - OpConstructContext *construct_context, - std::shared_ptr op_def, - bool has_data_format, - bool is_quantize_model) { - // Create the Operation - DeviceType target_device_type = target_device_->device_type(); - DeviceType device_type = DeviceType::CPU; - construct_context->set_device(cpu_device_.get()); - construct_context->set_operator_def(op_def); - construct_context->set_output_mem_type(MemoryType::CPU_BUFFER); - // Get available devices - auto available_devices = - op_registry->AvailableDevices(op_def->type(), construct_context); - // Find the device type to run the op. - // If the target_device_type in available devices, use target_device_type, - // otherwise, fallback to CPU device. - for (auto device : available_devices) { - if (device == target_device_type) { - device_type = target_device_type; - construct_context->set_device(target_device_); - if (target_device_->device_type() == DeviceType::GPU) { - construct_context->set_output_mem_type(MemoryType::GPU_IMAGE); - } - break; - } - } - op_def->set_device_type(device_type); - - // transpose output shape if run on CPU (default format is NHWC) - if (!is_quantize_model && device_type == DeviceType::CPU && - op_def->output_shape_size() == op_def->output_size()) { - for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { - if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) { - // NHWC -> NCHW - std::vector output_shape = - TransposeShape( - std::vector( - op_def->output_shape(out_idx).dims().begin(), - op_def->output_shape(out_idx).dims().end()), - {0, 3, 1, 2}); - for (int i = 0; i < 4; ++i) { - op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]); - } - } - } - } - - return op_registry->CreateOperation(construct_context, device_type); -} - SerialNet::SerialNet(const OpRegistryBase *op_registry, const NetDef *net_def, Workspace *ws, @@ -138,237 +47,47 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, target_device->cpu_runtime()->policy(), &target_device->cpu_runtime()->thread_pool())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - // quantize model flag - bool is_quantize_model = IsQuantizedModel(*net_def); - // Tensor Shape map - std::unordered_map> tensor_shape_map; - for (auto &op : net_def->op()) { - if (op.output_size() != op.output_shape_size()) { - continue; - } - for (int i = 0; i < op.output_size(); ++i) { - tensor_shape_map[op.output(i)] = std::vector( - op.output_shape(i).dims().begin(), - op.output_shape(i).dims().end()); - } - } - for (auto &tensor : net_def->tensors()) { - tensor_shape_map[tensor.name()] = - std::vector(tensor.dims().begin(), tensor.dims().end()); - } - bool has_data_format = false; - if (target_device_->device_type() == DeviceType::CPU) { - for (auto &input_info : net_def->input_info()) { - std::vector input_shape = - std::vector(input_info.dims().begin(), - input_info.dims().end()); - // update tensor shape map - tensor_shape_map[input_info.name()] = input_shape; - // Only could be NONE or NHWC - DataFormat input_data_format = static_cast( - input_info.data_format()); - has_data_format = has_data_format || - (input_data_format != DataFormat::DF_NONE); - if (!is_quantize_model && input_data_format == DataFormat::NHWC && - input_info.dims_size() == 4) { - // NHWC -> NCHW - input_shape = - TransposeShape(input_shape, {0, 3, 1, 2}); - } - } - } #ifdef MACE_ENABLE_OPENCL - // output tensor : related information - std::unordered_map output_map; // used for memory optimization std::unordered_map output_mem_map; - std::unordered_set transformed_set; - // add input information - MemoryType target_mem_type; - // default data format of output tensor - DataFormat default_output_df = DataFormat::DF_NONE; - if (target_device_->device_type() == DeviceType::GPU) { - target_mem_type = MemoryType::GPU_BUFFER; - for (auto &input_info : net_def->input_info()) { - DataFormat input_data_format = static_cast( - input_info.data_format()); - has_data_format = input_data_format != DataFormat::DF_NONE; - std::vector input_shape = - std::vector(input_info.dims().begin(), - input_info.dims().end()); - // update tensor shape map - tensor_shape_map[input_info.name()] = input_shape; - output_map.emplace(input_info.name(), InternalOutputInfo( - target_mem_type, DataType::DT_FLOAT, input_data_format, - input_shape, -1)); - } - default_output_df = - has_data_format ? DataFormat::NHWC : DataFormat::DF_NONE; - } #endif // MACE_ENABLE_OPENCL - OpConstructContext construct_context(ws_, &tensor_shape_map); + OpConstructContext construct_context(ws_); for (int idx = 0; idx < net_def->op_size(); ++idx) { std::shared_ptr op_def(new OperatorDef(net_def->op(idx))); // Create operation - auto op = CreateOperation(op_registry, - &construct_context, - op_def, - has_data_format, - is_quantize_model); -#ifdef MACE_ENABLE_OPENCL - // Add input transform operation if necessary - if (target_device_->device_type() == DeviceType::GPU) { - // the outputs' memory type of the operation - MemoryType out_mem_type = construct_context.output_mem_type(); - int input_size = op_def->input_size(); - // if op is memory-unused op, no transformation - if (TransformRequiredOp(op_def->type())) { - for (int i = 0; i < input_size; ++i) { - if (output_map.count(op_def->input(i)) == 1) { - // if op is memory-reuse op, no transformation - if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) { - out_mem_type = output_map.at(op_def->input(i)).mem_type; - break; - } - // check whether to do transform - MemoryType wanted_in_mem_type = - construct_context.GetInputMemType(i); - DataType wanted_in_dt = construct_context.GetInputDataType(i); - if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type - || output_map.at(op_def->input(i)).dtype != wanted_in_dt) { - auto t_input_name = TransformedName(op_def->input(i), - wanted_in_mem_type); - auto &output_info = output_map.at(op_def->input(i)); - // check whether the tensor has been transformed - if (transformed_set.count(t_input_name) == 0) { - VLOG(1) << "Add Transform operation " << op_def->name() - << " to transform tensor " - << op_def->input(i) << "', from memory type " - << output_info.mem_type << " to " - << wanted_in_mem_type - << ", from Data Type " << output_info.dtype << " to " - << wanted_in_dt << ". with data format " - << output_info.data_format; - std::string input_name = op_def->input(i); - op_def->set_input(i, t_input_name); - auto input_shape = output_info.shape; - if (output_info.mem_type == MemoryType::CPU_BUFFER && - output_info.data_format == DataFormat::NCHW && - input_shape.size() == 4) { - // NCHW -> NHWC - input_shape = - TransposeShape(input_shape, - {0, 2, 3, 1}); - } - auto transform_op_def = OpenCLUtil::CreateTransformOpDef( - input_name, input_shape, t_input_name, wanted_in_dt, - construct_context.GetInputOpenCLBufferType(i), - wanted_in_mem_type, has_data_format); - OpConstructContext t_construct_context(ws_); - auto transform_op = CreateOperation( - op_registry, - &t_construct_context, - transform_op_def, - has_data_format); - operators_.emplace_back(std::move(transform_op)); - transformed_set.insert(t_input_name); - output_mem_map[t_input_name] = wanted_in_mem_type; - // where to do graph reference count. - mem_optimizer->UpdateTensorRef(transform_op_def.get()); - } else { - op_def->set_input(i, t_input_name); - } - } - } else { - MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr - && ws_->GetTensor(op_def->input(i))->is_weight(), - "Tensor ", op_def->input(i), " of ", - op_def->name(), " not allocated"); - } - } - } - // update the map : output_tensor -> Operation - for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { - DataType dt; - if (op_def->output_type_size() == op_def->output_size()) { - dt = op_def->output_type(out_idx); - } else { - dt = static_cast( - ProtoArgHelper::GetOptionalArg( - *op_def, "T", static_cast(DataType::DT_FLOAT))); - } - output_mem_map[op_def->output(out_idx)] = out_mem_type; - output_map.emplace( - op_def->output(out_idx), - InternalOutputInfo( - out_mem_type, - dt, - default_output_df, - op_def->output_shape().empty() ? - std::vector() : - std::vector( - op_def->output_shape(out_idx).dims().begin(), - op_def->output_shape(out_idx).dims().end()), - static_cast(operators_.size()))); - } + auto op_device_type = static_cast(op_def->device_type()); + if (op_device_type == target_device_->device_type()) { + construct_context.set_device(target_device_); + } else if (op_device_type == DeviceType::CPU) { + construct_context.set_device(cpu_device_.get()); + } else { + LOG(FATAL) << "Encounter unexpected error: " + << op_device_type << " vs " << target_device_->device_type(); } -#endif // MACE_ENABLE_OPENCL + construct_context.set_operator_def(op_def); + + auto op = op_registry->CreateOperation(&construct_context, + op_device_type); operators_.emplace_back(std::move(op)); // where to do graph reference count. mem_optimizer->UpdateTensorRef(op_def.get()); - } #ifdef MACE_ENABLE_OPENCL - // Transform the output tensor if necessary - if (target_device_->device_type() == DeviceType::GPU) { - for (auto &output_info : net_def->output_info()) { - auto &internal_output_info = output_map.at(output_info.name()); - if ((internal_output_info.mem_type != target_mem_type && - internal_output_info.mem_type != MemoryType::CPU_BUFFER) || - internal_output_info.dtype != output_info.data_type()) { - VLOG(1) << "Add Transform operation to transform output tensor '" - << output_info.name() << "', from memory type " - << internal_output_info.mem_type - << " to " << target_mem_type - << ", from Data Type " << internal_output_info.dtype - << " to " << output_info.data_type(); - std::string t_output_name = TransformedName(output_info.name(), - target_mem_type); - auto output_op_def = - operators_[internal_output_info.op_idx]->operator_def(); - int output_size = output_op_def->output_size(); - for (int i = 0; i < output_size; ++i) { - if (output_op_def->output(i) == output_info.name()) { - output_op_def->set_output(i, t_output_name); - // update the output : mem_type map - output_mem_map[t_output_name] = output_mem_map[output_info.name()]; - output_mem_map[output_info.name()] = target_mem_type; - } - } - bool output_has_data_format = - static_cast(output_info.data_format()); - auto transform_op_def = OpenCLUtil::CreateTransformOpDef( - t_output_name, - internal_output_info.shape, - output_info.name(), - output_info.data_type(), - OpenCLBufferType::IN_OUT_CHANNEL, - target_mem_type, - output_has_data_format); - auto transform_op = CreateOperation( - op_registry, - &construct_context, - transform_op_def, - output_has_data_format); - operators_.emplace_back(std::move(transform_op)); - // where to do graph reference count. - mem_optimizer->UpdateTensorRef(transform_op_def.get()); + if (target_device_->device_type() == DeviceType::GPU) { + // update the map : output_tensor -> MemoryType + MemoryType out_mem_type = + static_cast( + ProtoArgHelper::GetOptionalArg( + net_def->op(idx), OutputMemoryTypeTagName(), + static_cast(MemoryType::CPU_BUFFER))); + for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { + output_mem_map[op_def->output(out_idx)] = out_mem_type; } } - } #endif // MACE_ENABLE_OPENCL + } // Update output tensor reference for (auto &output_info : net_def->output_info()) { mem_optimizer->UpdateTensorRef(output_info.name()); diff --git a/mace/core/net.h b/mace/core/net.h index 788eb611a54158791f988d446153b4b50ef8a59e..18ec5134549ddf2a9fa62139034bb051e0afd64e 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -54,14 +54,6 @@ class SerialNet : public NetBase { MaceStatus Run(RunMetadata *run_metadata = nullptr) override; - private: - std::unique_ptr CreateOperation( - const OpRegistryBase *op_registry, - OpConstructContext *construct_context, - std::shared_ptr op_def, - bool has_data_format, - bool is_quantize_model = false); - protected: Workspace *ws_; Device *target_device_; diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c7bb86517a96f011955cfd3b98a4f3b0050f9cb --- /dev/null +++ b/mace/core/net_def_adapter.cc @@ -0,0 +1,652 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/net_def_adapter.h" + +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/math.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL +namespace mace { + +namespace { +DataFormat GetDefaultDataFormat(DeviceType device_type, + bool is_quantized_model) { + if (device_type == CPU) { + if (is_quantized_model) { + return DataFormat::NHWC; + } else { + return DataFormat::NCHW; + } + } else if (device_type == GPU) { + return DataFormat::NHWC; + } else { + LOG(FATAL) << "MACE do not support the device " << device_type; + return DataFormat::NONE; + } +} + +template +std::string TransformedName(const std::string &input_name, + const std::string &tag, + const T value) { + std::stringstream ss; + ss << input_name << "_" << tag << "_" << value; + return ss.str(); +} + +#ifdef MACE_ENABLE_OPENCL +bool TransformRequiredOp(const std::string &op_type) { + static const std::unordered_set kNoTransformOp = { + "Shape", "InferConv2dShape" + }; + return kNoTransformOp.count(op_type) == 0; +} +#endif // MACE_ENABLE_OPENCL + +void BuildTransposeOpDef( + const std::string &input_name, + const std::string &output_name, + const std::vector &output_shape, + const std::vector dst_dims, + const DataType dt, + DeviceType device_type, + OperatorDef *op_def) { + std::string op_name = "mace_node_" + output_name; + op_def->set_name(op_name); + op_def->set_type("Transpose"); + op_def->add_input(input_name); + op_def->add_output(output_name); + op_def->set_device_type(device_type); + Argument *arg = op_def->add_arg(); + arg->set_name("dims"); + for (auto dim : dst_dims) { + arg->add_ints(dim); + } + arg = op_def->add_arg(); + arg->set_name("T"); + arg->set_i(static_cast(dt)); + if (!output_shape.empty()) { + OutputShape *shape = op_def->add_output_shape(); + for (auto value : output_shape) { + shape->add_dims(value); + } + } +} + +} // namespace + +NetDefAdapter::NetDefAdapter(const OpRegistryBase *op_registry, + const Workspace *ws) + : op_registry_(op_registry), ws_(ws) {} + +MaceStatus NetDefAdapter::AdaptNetDef( + const NetDef *net_def, + Device *target_device, + NetDef *target_net_def) { + MACE_LATENCY_LOGGER(1, "Adapting original NetDef"); + // Copy from original op_def, leave ops alone. + target_net_def->mutable_arg()->CopyFrom(net_def->arg()); + target_net_def->mutable_tensors()->CopyFrom(net_def->tensors()); + target_net_def->mutable_input_info()->CopyFrom(net_def->input_info()); + target_net_def->mutable_output_info()->CopyFrom(net_def->output_info()); + + std::unique_ptr cpu_device = make_unique( + target_device->cpu_runtime()->num_threads(), + target_device->cpu_runtime()->policy(), + &(target_device->cpu_runtime()->thread_pool())); + + // quantize model flag + bool is_quantized_model = IsQuantizedModel(*net_def); + // Const tensors(filter) -> shape + std::unordered_map> tensor_shape_map; + // Output tensors -> information + TensorInfoMap output_map; + // output tensor : related information + std::unordered_set transformed_set; + + for (auto &tensor : net_def->tensors()) { + tensor_shape_map[tensor.name()] = + std::vector(tensor.dims().begin(), tensor.dims().end()); + } + + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (target_device->device_type() == DeviceType::CPU) { + mem_type = MemoryType::CPU_BUFFER; + } else if (target_device->device_type() == DeviceType::GPU) { + mem_type = MemoryType::GPU_BUFFER; + } else { + LOG(FATAL) << "MACE do not support the device type: " + << target_device->device_type(); + } + + int input_size = target_net_def->input_info_size(); + for (int i = 0; i < input_size; ++i) { + auto input_info = target_net_def->mutable_input_info(i); + auto input_data_format = static_cast( + input_info->data_format()); + DataFormat expected_data_format = GetDefaultDataFormat( + target_device->device_type(), is_quantized_model); + std::vector input_shape(input_info->dims().begin(), + input_info->dims().end()); + if (input_data_format != DataFormat::NONE + && input_data_format != expected_data_format + && input_shape.size() == 4) { + if (input_data_format == DataFormat::NHWC + && expected_data_format == DataFormat::NCHW) { + std::vector dst_dims{0, 3, 1, 2}; + input_data_format = DataFormat::NCHW; + input_shape = TransposeShape(input_shape, dst_dims); + } else if (input_data_format == DataFormat::NCHW + && expected_data_format == DataFormat::NHWC) { + std::vector dst_dims{0, 2, 3, 1}; + input_data_format = DataFormat::NHWC; + input_shape = TransposeShape(input_shape, dst_dims); + } + input_info->set_data_format(static_cast(input_data_format)); + int input_shape_size = input_shape.size(); + for (int j = 0; j < input_shape_size; ++j) { + input_info->set_dims(j, input_shape[j]); + } + } + output_map.emplace(input_info->name(), InternalOutputInfo( + mem_type, input_info->data_type(), + input_data_format, input_shape, -1)); + } + + OpConditionContext context(ws_, &tensor_shape_map); + DataFormat op_output_data_format; + MemoryType op_output_mem_type; + for (int idx = 0; idx < net_def->op_size(); ++idx) { + OperatorDef op_def(net_def->op(idx)); + context.set_operator_def(&op_def); + // Select device + MACE_RETURN_IF_ERROR(this->AdaptDevice(&context, + target_device, + cpu_device.get(), + output_map, + target_net_def, + &op_def)); + + // Adapt data type + MACE_RETURN_IF_ERROR(this->AdaptDataType(&context, + &op_def)); + + if (op_def.device_type() == DeviceType::GPU) { + MACE_RETURN_IF_ERROR(this->AdaptDataFormat(&context, + &op_def, + is_quantized_model, + &output_map, + &transformed_set, + &op_output_data_format, + target_net_def)); + MACE_RETURN_IF_ERROR(this->AdaptMemoryType(&context, + &op_def, + &output_map, + &transformed_set, + &op_output_mem_type, + target_net_def)); + } else { + MACE_RETURN_IF_ERROR(this->AdaptMemoryType(&context, + &op_def, + &output_map, + &transformed_set, + &op_output_mem_type, + target_net_def)); + MACE_RETURN_IF_ERROR(this->AdaptDataFormat(&context, + &op_def, + is_quantized_model, + &output_map, + &transformed_set, + &op_output_data_format, + target_net_def)); + } + + int output_size = op_def.output_size(); + for (int out_idx = 0; out_idx < output_size; ++out_idx) { + DataType dt; + if (op_def.output_type_size() == op_def.output_size()) { + dt = op_def.output_type(out_idx); + } else { + dt = static_cast( + ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DataType::DT_FLOAT))); + } + output_map.emplace( + op_def.output(out_idx), + InternalOutputInfo( + op_output_mem_type, + dt, + op_output_data_format, + op_def.output_shape().empty() ? + std::vector() : + std::vector( + op_def.output_shape(out_idx).dims().begin(), + op_def.output_shape(out_idx).dims().end()), + target_net_def->op_size())); + } + // Add op to target net + target_net_def->add_op()->CopyFrom(op_def); + } + +#ifdef MACE_ENABLE_OPENCL + if (target_device->device_type() == DeviceType::GPU) { + // Add buffer transform for GPU if necessary + MemoryType target_mem_type = MemoryType::GPU_BUFFER; + for (auto &output_info : net_def->output_info()) { + auto &internal_output_info = output_map.at(output_info.name()); + if ((internal_output_info.mem_type != target_mem_type && + internal_output_info.mem_type != MemoryType::CPU_BUFFER) || + internal_output_info.dtype != output_info.data_type()) { + VLOG(1) << "Add Transform operation to transform output tensor '" + << output_info.name() << "', from memory type " + << internal_output_info.mem_type + << " to " << target_mem_type + << ", from Data Type " << internal_output_info.dtype + << " to " << output_info.data_type(); + std::string t_output_name = TransformedName(output_info.name(), + "mem_type", + target_mem_type); + auto output_op_def = target_net_def->mutable_op( + internal_output_info.op_idx); + int output_size = output_op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (output_op_def->output(i) == output_info.name()) { + output_op_def->set_output(i, t_output_name); + } + } + auto transformed_op_def = target_net_def->add_op(); + OpenCLUtil::BuildTransformOpDef( + t_output_name, + internal_output_info.shape, + output_info.name(), + output_info.data_type(), + OpenCLBufferType::IN_OUT_CHANNEL, + target_mem_type, + internal_output_info.data_format, + transformed_op_def); + // set data format arg + SetProtoArg( + transformed_op_def, + "data_format", + static_cast(internal_output_info.data_format)); + // set output memory type argument + SetProtoArg(transformed_op_def, + OutputMemoryTypeTagName(), + target_mem_type); + } + } + } +#endif // MACE_ENABLE_OPENCL + + VLOG(1) << DebugString(target_net_def); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, + Device *target_device, + Device *cpu_device, + const TensorInfoMap &output_map, + const NetDef *net_def, + OperatorDef *op_def) { + VLOG(3) << "Adapt device for op " << op_def->name(); + DeviceType target_device_type = target_device->device_type(); + DeviceType device_type = DeviceType::CPU; + context->set_device(cpu_device); + if (target_device_type != DeviceType::CPU) { + std::vector producer_devices; + for (auto input : op_def->input()) { + if (output_map.count(input) == 1) { + if (output_map.at(input).op_idx < 0) { + producer_devices.push_back(target_device_type); + } else { + producer_devices.push_back( + static_cast( + net_def->op(output_map.at(input).op_idx).device_type())); + } + } + } + // Get available devices + auto available_devices = + op_registry_->AvailableDevices(op_def->type(), context); + device_type = net_optimizer_.SelectBestDevice(op_def, + target_device_type, + available_devices, + producer_devices); + if (device_type == target_device_type) { + context->set_device(target_device); + } else { + LOG(INFO) << "Op " << op_def->name() << " fall back to CPU"; + } + } + op_def->set_device_type(device_type); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus NetDefAdapter::AdaptDataType(OpConditionContext *context, + OperatorDef *op_def) { + MACE_UNUSED(context); + // Where to add logic to support mixing precision + // Adjust data type of op ran on CPU + DataType dtype = static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "T", static_cast(DT_FLOAT))); + if (op_def->device_type() == DeviceType::CPU && dtype == DT_HALF) { + SetProtoArg(op_def, "T", static_cast(DataType::DT_FLOAT)); + } + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus NetDefAdapter::AdaptDataFormat( + OpConditionContext *context, + OperatorDef *op_def, + bool is_quantized_model, + TensorInfoMap *output_map, + std::unordered_set *transformed_set, + DataFormat *op_output_df, + NetDef *target_net_def) { + VLOG(3) << "Adapt data format for op " << op_def->name(); + DataFormat op_data_format = + static_cast(ProtoArgHelper::GetOptionalArg( + *op_def, "data_format", + static_cast(DataFormat::NONE))); + // adjust the data format of operation + if (op_data_format == DataFormat::AUTO) { + op_data_format = GetDefaultDataFormat( + static_cast(op_def->device_type()), is_quantized_model); + SetProtoArg(op_def, "data_format", static_cast(op_data_format)); + if (op_data_format == DataFormat::NCHW) { + int output_shape_size = op_def->output_shape_size(); + for (int i = 0; i < output_shape_size; ++i) { + auto output_shape = op_def->mutable_output_shape(i); + MACE_CHECK(output_shape->dims_size() == 4, + "Output shape should be 4D if the of has data format. ", + op_def->name()); + // transpose output shape format from NHWC to NCHW + int64_t height = output_shape->dims(1); + int64_t width = output_shape->dims(2); + output_shape->set_dims(1, output_shape->dims(3)); + output_shape->set_dims(2, height); + output_shape->set_dims(3, width); + } + } + } + *op_output_df = op_data_format; + + // the output memory type of transpose op is based on the consumer op's device + MemoryType target_mem_type = MemoryType::CPU_BUFFER; + if (op_def->device_type() == DeviceType::GPU) { + target_mem_type = MemoryType::GPU_BUFFER; + } + auto inputs_data_format = op_registry_->InputsDataFormat(op_def->type(), + context); + DataFormat src_df, dst_df; + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (output_map->count(op_def->input(i)) == 0) { + // check this input is const tensor(filter) + MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr + && ws_->GetTensor(op_def->input(i))->is_weight(), + "Tensor ", op_def->input(i), " of ", + op_def->name(), " is not allocated by Workspace ahead"); + continue; + } + src_df = output_map->at(op_def->input(i)).data_format; + dst_df = inputs_data_format[i]; + if (src_df == DataFormat::NONE + || dst_df == DataFormat::NONE + || output_map->at(op_def->input(i)).shape.size() != 4) { + continue; + } + if (src_df != dst_df) { + std::string transformed_name = TransformedName(op_def->input(i), + "data_format", static_cast(dst_df)); + if (transformed_set->count(transformed_name) == 0) { + VLOG(1) << "Add Transpose operation " << op_def->name() + << " to transpose tensor " + << op_def->input(i) << "', from data format " + << static_cast(src_df) << " to " + << static_cast(dst_df); + // Only support transpose between NHWC and NCHW for now. + std::vector dst_dims; + if (src_df == DataFormat::NCHW && dst_df == DataFormat::NHWC) { + dst_dims = {0, 2, 3, 1}; + } else if (src_df == DataFormat::NHWC && dst_df == DataFormat::NCHW) { + dst_dims = {0, 3, 1, 2}; + } else { + LOG(FATAL) << "Encounter unsupported data format transpose from " + << static_cast(src_df) << " to " + << static_cast(dst_df); + } + auto &input_info = output_map->at(op_def->input(i)); + auto output_shape = input_info.shape.empty() ? + std::vector() : + TransposeShape(input_info.shape, + dst_dims); + OperatorDef *transpose_op_def = target_net_def->add_op(); + BuildTransposeOpDef( + op_def->input(i), + transformed_name, + output_shape, + dst_dims, + input_info.dtype, + DeviceType::CPU, + transpose_op_def); + // set data format arg + SetProtoArg(transpose_op_def, + "data_format", + static_cast(dst_df)); + // set output memory type argument + SetProtoArg(transpose_op_def, + OutputMemoryTypeTagName(), + target_mem_type); + + // update output information map + output_map->emplace( + transformed_name, + InternalOutputInfo( + target_mem_type, + input_info.dtype, + dst_df, + output_shape, + target_net_def->op_size() - 1)); + // record transformed tensors + transformed_set->insert(transformed_name); + } + // update original op_def's input + op_def->set_input(i, transformed_name); + } + } + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus NetDefAdapter::AdaptMemoryType( + OpConditionContext *context, + OperatorDef *op_def, + NetDefAdapter::TensorInfoMap *output_map, + std::unordered_set *transformed_set, + MemoryType *op_output_mem_types, + NetDef *target_net_def) { + VLOG(3) << "Adapt memory type for op " << op_def->name(); + // Get expected output memory type + // (only support one kind of memory type for multiple outputs) + op_registry_->GetInOutMemoryTypes(op_def->type(), context); +#ifdef MACE_ENABLE_OPENCL + // if op is memory-unused op, no transformation + if (TransformRequiredOp(op_def->type())) { + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (output_map->count(op_def->input(i)) == 0) { + MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr + && ws_->GetTensor(op_def->input(i))->is_weight(), + "Tensor ", op_def->input(i), " of ", + op_def->name(), " not allocated"); + continue; + } + auto &input_info = output_map->at(op_def->input(i)); + // check whether to do transform + MemoryType src_mem_type = input_info.mem_type; + MemoryType dst_mem_type = context->GetInputMemType(i); + auto wanted_input_dtype = context->GetInputDataType(i); + if (src_mem_type != dst_mem_type || + (input_info.dtype != wanted_input_dtype && + (src_mem_type != MemoryType::CPU_BUFFER + || dst_mem_type != MemoryType::CPU_BUFFER))) { + auto transformed_name = TransformedName(op_def->input(i), + "mem_type", + dst_mem_type); + // check whether the tensor has been transformed + if (transformed_set->count(transformed_name) == 0) { + VLOG(1) << "Add Transform operation " << op_def->name() + << " to transform tensor " + << op_def->input(i) << "', from memory type " + << input_info.mem_type << " to " + << dst_mem_type; + OperatorDef *transformed_op_def = target_net_def->add_op(); + OpenCLUtil::BuildTransformOpDef( + op_def->input(i), + input_info.shape, + transformed_name, + wanted_input_dtype, + context->GetInputOpenCLBufferType(i), + dst_mem_type, + input_info.data_format, + transformed_op_def); + // set data format arg + SetProtoArg(transformed_op_def, + "data_format", + static_cast(input_info.data_format)); + // set output memory type argument + SetProtoArg(transformed_op_def, + OutputMemoryTypeTagName(), + dst_mem_type); + + // update output information map + output_map->emplace( + transformed_name, + InternalOutputInfo( + dst_mem_type, + context->GetInputDataType(i), + input_info.data_format, + input_info.shape, + target_net_def->op_size() - 1)); + // record transformed tensors + transformed_set->insert(transformed_name); + } + // update original op_def's input + op_def->set_input(i, transformed_name); + } + } + } +#else + MACE_UNUSED(output_map); + MACE_UNUSED(transformed_set); + MACE_UNUSED(target_net_def); +#endif // MACE_ENABLE_OPENCL + *op_output_mem_types = context->output_mem_type(); + SetProtoArg(op_def, + OutputMemoryTypeTagName(), + context->output_mem_type()); + return MaceStatus::MACE_SUCCESS; +} + +std::string NetDefAdapter::DebugString(const NetDef *net_def) { + std::stringstream sstream; + auto DeviceTypeToStrFunc = [](DeviceType device_type) -> std::string { + if (device_type == DeviceType::CPU) { + return "CPU"; + } else if (device_type == DeviceType::GPU) { + return "GPU"; + } else { + return "Unknown"; + } + }; + auto MemoryTypeToStrFunc = [](MemoryType type) -> std::string { + if (type == MemoryType::CPU_BUFFER) { + return "CPU_BUFFER"; + } else if (type == MemoryType::GPU_BUFFER) { + return "GPU_BUFFER"; + } else if (type == MemoryType::GPU_IMAGE) { + return "GPU_IMAGE"; + } else { + return "Unknown"; + } + }; + auto DataFormatToStrFunc = [](DataFormat type) -> std::string { + if (type == DataFormat::NHWC) { + return "NHWC"; + } else if (type == DataFormat::NCHW) { + return "NCHW"; + } else if (type == DataFormat::NONE) { + return "NONE"; + } else if (type == DataFormat::AUTO) { + return "AUTO"; + } else if (type == DataFormat::OIHW) { + return "OIHW"; + } else { + return "Unknown"; + } + }; + for (auto &op : net_def->op()) { + std::string device_type = DeviceTypeToStrFunc( + static_cast(op.device_type())); + std::string data_type = DataTypeToString(static_cast( + ProtoArgHelper::GetOptionalArg( + op, "T", static_cast(DT_FLOAT)))); + std::string mem_type = MemoryTypeToStrFunc( + static_cast( + ProtoArgHelper::GetOptionalArg( + op, OutputMemoryTypeTagName(), + static_cast(MemoryType::CPU_BUFFER)))); + std::string data_format = DataFormatToStrFunc( + static_cast( + ProtoArgHelper::GetOptionalArg( + op, "data_format", static_cast(DataFormat::NONE)))); + + sstream << std::endl; + sstream << "{" << std::endl; + sstream << " name: " << op.name() << std::endl; + sstream << " type: " << op.type() << std::endl; + sstream << " device: " << device_type << std::endl; + sstream << " data type: " << data_type << std::endl; + sstream << " data format: " << data_format << std::endl; + sstream << " memory type: " << mem_type << std::endl; + sstream << " inputs: ["; + for (auto input : op.input()) { + sstream << input << ", "; + } + sstream << "]" << std::endl; + sstream << " outputs: ["; + for (auto output : op.output()) { + sstream << output << ", "; + } + sstream << "]" << std::endl; + sstream << " output shapes: ["; + for (auto output_shape : op.output_shape()) { + sstream << "("; + for (auto dim : output_shape.dims()) + sstream << dim << ","; + sstream << ") "; + } + sstream << "]" << std::endl; + sstream << "}"; + } + return sstream.str(); +} + +} // namespace mace diff --git a/mace/core/net_def_adapter.h b/mace/core/net_def_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..d821ed810c32d2ef7d5644430948ad010c63e646 --- /dev/null +++ b/mace/core/net_def_adapter.h @@ -0,0 +1,116 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_NET_DEF_ADAPTER_H_ +#define MACE_CORE_NET_DEF_ADAPTER_H_ +#include +#include +#include +#include +#include + +#include "mace/core/types.h" +#include "mace/proto/mace.pb.h" +#include "mace/port/port.h" +#include "mace/core/operator.h" +#include "mace/core/net_optimizer.h" + +namespace mace { + +class OpRegistryBase; +class Workspace; +class Device; + +/// Conventions: +/// 1. DataFormat::AUTO stands for formatted (NHWC or NCHW) +/// 2. if Op with DataFormat::AUTO, the arguments of this op +/// is formatted to NHWC +class NetDefAdapter { + public: + NetDefAdapter(const OpRegistryBase *op_registry, + const Workspace *ws); + // Adapt original net_def to a better net. + // 1. Adapt device: choose best device for every op in the net. + // 2. Adapt data type: Add data type related transform ops + // for mixing precision. + // 3. Adapt data format: confirm data format of every op + // and add transpose if necessary. + // 4. Adapt memory type: Add BufferTransform if necessary + // for transforming memory type between ops. + MaceStatus AdaptNetDef( + const NetDef *net_def, + Device *target_device, + NetDef *target_net_def); + + public: + NetDefAdapter(const NetDefAdapter&) = delete; + NetDefAdapter(const NetDefAdapter&&) = delete; + NetDefAdapter &operator=(const NetDefAdapter &) = delete; + NetDefAdapter &operator=(const NetDefAdapter &&) = delete; + + private: + struct InternalOutputInfo { + InternalOutputInfo(const MemoryType mem_type, + const DataType dtype, + const DataFormat data_format, + const std::vector &shape, + int op_idx) + : mem_type(mem_type), dtype(dtype), data_format(data_format), + shape(shape), op_idx(op_idx) {} + + MemoryType mem_type; + DataType dtype; + DataFormat data_format; + std::vector shape; // tensor shape + int op_idx; // operation which generate the tensor + }; + + typedef std::unordered_map TensorInfoMap; + + private: + MaceStatus AdaptDevice(OpConditionContext *context, + Device *target_device, + Device *cpu_device, + const TensorInfoMap &output_map, + const NetDef *net_def, + OperatorDef *op); + MaceStatus AdaptDataType(OpConditionContext *context, + OperatorDef *op); + MaceStatus AdaptDataFormat( + OpConditionContext *context, + OperatorDef *op, + bool is_quantized_model, + TensorInfoMap *output_map, + std::unordered_set *transformed_set, + DataFormat *op_output_df, + NetDef *target_net_def); + + MaceStatus AdaptMemoryType( + OpConditionContext *context, + OperatorDef *op_def, + TensorInfoMap *output_map, + std::unordered_set *transformed_set, + MemoryType *op_output_mem_types, + NetDef *target_net_def); + + std::string DebugString(const NetDef *net_def); + + private: + const OpRegistryBase *op_registry_; + const Workspace *ws_; + NetOptimizer net_optimizer_; +}; + +} // namespace mace +#endif // MACE_CORE_NET_DEF_ADAPTER_H_ diff --git a/mace/core/net_optimizer.cc b/mace/core/net_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..4382b51b37fc76ea36dfebf4da802cd85bd78130 --- /dev/null +++ b/mace/core/net_optimizer.cc @@ -0,0 +1,50 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/net_optimizer.h" + +#include + +namespace mace { + +DeviceType NetOptimizer::SelectBestDevice( + const OperatorDef *op_def, + DeviceType target_device_type, + const std::set &available_devices, + const std::vector &inputs_op_devices) { + static const std::set kComputeIntensiveOps = { + "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d", + "FullyConnected" + }; + // CPU is the device to fall back + DeviceType best_device = DeviceType::CPU; + if (available_devices.count(target_device_type) == 1) { + best_device = target_device_type; + } + if (best_device == DeviceType::CPU) { + return best_device; + } + // Put compute-intensive ops in target device + if (kComputeIntensiveOps.count(op_def->type()) == 1) { + return best_device; + } + // Greedy strategy: Use input op's device type as current op's device + for (auto device_type : inputs_op_devices) { + if (device_type != best_device) { + best_device = device_type; + } + } + return best_device; +} +} // namespace mace diff --git a/mace/core/net_optimizer.h b/mace/core/net_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..23f1897cc73f143fdac0b39eca2070b6d9714263 --- /dev/null +++ b/mace/core/net_optimizer.h @@ -0,0 +1,48 @@ +// Copyright 2019 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_NET_OPTIMIZER_H_ +#define MACE_CORE_NET_OPTIMIZER_H_ + +#include +#include + +#include "mace/port/port.h" +#include "mace/proto/mace.pb.h" + +namespace mace { + +/// Any optimization for Net could be put in here in the future. +class NetOptimizer { + public: + /// Select best device for the op to support mixing usage of CPU and GPU. + /// Greedy strategy: one way to the end. If the op fallback to CPU, then + /// the follow-up ops will run on CPU too util meet + /// some compute-intensive ops(Convolution) to + /// reduce the memory copy between CPU and GPU. + /// Simple but effective. + /// + /// \param op_def the op + /// \param target_device target device to run on + /// \param available_devices available devices of the op + /// \param inputs_op_devices devices of father ops run on + /// \return Best device for the op_def + DeviceType SelectBestDevice(const OperatorDef *op_def, + DeviceType target_device, + const std::set &available_devices, + const std::vector &inputs_op_devices); +}; + +} // namespace mace +#endif // MACE_CORE_NET_OPTIMIZER_H_ diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 8fae1bd8a710f0fb9f6536960ae195ab6b94cba1..605ae3a759b9beae2d930263f20316490c15fd1b 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -20,36 +20,23 @@ #include "mace/core/operator.h" namespace mace { - -OpConstructContext::OpConstructContext(Workspace *ws) - : operator_def_(nullptr), - ws_(ws), - device_(nullptr), - tensor_shape_info_(nullptr) {} - -OpConstructContext::OpConstructContext( - mace::Workspace *ws, - mace::OpConstructContext::TensorShapeMap *info) +OpConditionContext::OpConditionContext( + const Workspace *ws, + OpConditionContext::TensorShapeMap *info) : operator_def_(nullptr), ws_(ws), device_(nullptr), tensor_shape_info_(info) {} -void OpConstructContext::set_operator_def( - std::shared_ptr operator_def) { +void OpConditionContext::set_operator_def( + const OperatorDef *operator_def) { operator_def_ = operator_def; input_data_types_.clear(); } -void OpConstructContext::set_output_mem_type(mace::MemoryType type) { - MACE_CHECK(operator_def_ != nullptr); - output_mem_type_ = type; - input_mem_types_.clear(); -} - -void OpConstructContext::SetInputInfo(size_t idx, - mace::MemoryType mem_type, - mace::DataType dt) { +void OpConditionContext::SetInputInfo(size_t idx, + MemoryType mem_type, + DataType dt) { if (input_mem_types_.empty()) { // the default inputs' memory types are same as output memory type. input_mem_types_.resize(operator_def_->input_size(), output_mem_type_); @@ -66,7 +53,13 @@ void OpConstructContext::SetInputInfo(size_t idx, input_data_types_[idx] = dt; } -MemoryType OpConstructContext::GetInputMemType(size_t idx) const { +void OpConditionContext::set_output_mem_type(MemoryType type) { + MACE_CHECK(operator_def_ != nullptr); + output_mem_type_ = type; + input_mem_types_.clear(); +} + +MemoryType OpConditionContext::GetInputMemType(size_t idx) const { if (input_mem_types_.empty()) { return output_mem_type_; } @@ -75,7 +68,7 @@ MemoryType OpConstructContext::GetInputMemType(size_t idx) const { return input_mem_types_[idx]; } -DataType OpConstructContext::GetInputDataType(size_t idx) const { +DataType OpConditionContext::GetInputDataType(size_t idx) const { if (input_data_types_.empty()) { // the default inputs' data types are same as operation's data type. return static_cast( @@ -87,17 +80,17 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const { } #ifdef MACE_ENABLE_OPENCL -void OpConstructContext::SetInputOpenCLBufferType( +void OpConditionContext::SetInputOpenCLBufferType( size_t idx, OpenCLBufferType buffer_type) { if (input_opencl_buffer_types_.empty()) { // the default inputs' memory types are same as output memory type. input_opencl_buffer_types_.resize(operator_def_->input_size(), - OpenCLBufferType::IN_OUT_CHANNEL); + OpenCLBufferType::IN_OUT_CHANNEL); } MACE_CHECK(idx < input_opencl_buffer_types_.size()); input_opencl_buffer_types_[idx] = buffer_type; } -OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType( +OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType( size_t idx) const { if (input_opencl_buffer_types_.empty()) { return OpenCLBufferType::IN_OUT_CHANNEL; @@ -107,6 +100,16 @@ OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType( } #endif // MACE_ENABLE_OPENCL +OpConstructContext::OpConstructContext(Workspace *ws) + : operator_def_(nullptr), + ws_(ws), + device_(nullptr) {} + +void OpConstructContext::set_operator_def( + std::shared_ptr operator_def) { + operator_def_ = operator_def; +} + OpInitContext::OpInitContext(Workspace *ws, Device *device) : ws_(ws), device_(device) {} @@ -202,19 +205,40 @@ const std::string OpKeyBuilder::Build() { } // namespace OpRegistrationInfo::OpRegistrationInfo() { - device_placer = [this](OpConstructContext *context) -> std::set { - auto op = context->operator_def(); - // The GPU ops only support 4D In/Out tensor by default - if (this->devices.count(DeviceType::CPU) == 1 && - op->output_shape_size() == op->output_size() && - op->output_shape(0).dims_size() != 4) { - return { DeviceType::CPU }; - } + // default device type placer + device_placer = [this](OpConditionContext *context) -> std::set { + MACE_UNUSED(context); return this->devices; }; + + // default input and output memory type setter + memory_type_setter = [](OpConditionContext *context) -> void { + if (context->device()->device_type() == DeviceType::GPU) { +#ifdef MACE_ENABLE_OPENCL + if (context->device()->gpu_runtime()->UseImageMemory()) { + context->set_output_mem_type(MemoryType::GPU_IMAGE); + } else { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } +#endif // MACE_ENABLE_OPENCL + } else { + context->set_output_mem_type(MemoryType::CPU_BUFFER); + } + }; + + data_format_selector = [](OpConditionContext *context) + -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return std::vector(context->operator_def()->input_size(), + op_data_format); + }; } -void OpRegistrationInfo::AddDevice(mace::DeviceType device) { +void OpRegistrationInfo::AddDevice(DeviceType device) { devices.insert(device); } @@ -226,9 +250,9 @@ void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) { MaceStatus OpRegistryBase::Register( const std::string &op_type, - const mace::DeviceType device_type, - const mace::DataType dt, - mace::OpRegistrationInfo::OpCreator creator) { + const DeviceType device_type, + const DataType dt, + OpRegistrationInfo::OpCreator creator) { if (registry_.count(op_type) == 0) { registry_[op_type] = std::unique_ptr( new OpRegistrationInfo); @@ -255,13 +279,29 @@ MaceStatus OpRegistryBase::Register( } const std::set OpRegistryBase::AvailableDevices( - const std::string &op_type, OpConstructContext *context) const { + const std::string &op_type, OpConditionContext *context) const { MACE_CHECK(registry_.count(op_type) != 0, op_type, " operation is not registered."); return registry_.at(op_type)->device_placer(context); } +void OpRegistryBase::GetInOutMemoryTypes( + const std::string &op_type, + OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + return registry_.at(op_type)->memory_type_setter(context); +} + +const std::vector OpRegistryBase::InputsDataFormat( + const std::string &op_type, + OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + return registry_.at(op_type)->data_format_selector(context); +} + std::unique_ptr OpRegistryBase::CreateOperation( OpConstructContext *context, DeviceType device_type) const { @@ -269,15 +309,6 @@ std::unique_ptr OpRegistryBase::CreateOperation( DataType dtype = static_cast( ProtoArgHelper::GetOptionalArg( *operator_def, "T", static_cast(DT_FLOAT))); - if (device_type == DeviceType::CPU && dtype == DT_HALF) { - int arg_size = operator_def->arg_size(); - for (int i = 0; i < arg_size; ++i) { - if (operator_def->arg(i).name() == "T") { - operator_def->mutable_arg(i)->set_i(DT_FLOAT); - } - } - dtype = DT_FLOAT; - } VLOG(1) << "Creating operator " << operator_def->name() << "(" << operator_def->type() << "<" << dtype << ">" << ") on " << device_type; @@ -308,9 +339,30 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc( return *this; } +OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter( + OpRegistrationInfo::MemoryTypeSetter setter) { + memory_type_setter_ = setter; + return *this; +} + +OpConditionBuilder& OpConditionBuilder::SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector) { + data_format_selector_ = selector; + return *this; +} + void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { - if (info != nullptr && placer_) { - info->device_placer = placer_; + if (info != nullptr) { + if (placer_) { + info->device_placer = placer_; + } + if (memory_type_setter_) { + info->memory_type_setter = memory_type_setter_; + } + + if (data_format_selector_) { + info->data_format_selector = data_format_selector_; + } } } diff --git a/mace/core/operator.h b/mace/core/operator.h index e59af9ab166a5ace99bc7cc59b17a025cc0b1645..9430d90d05be00ac2ae1e7034c4ea3f8c5dadfe2 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -32,22 +32,20 @@ namespace mace { -// memory_optimizer, device -class OpConstructContext { +// OpConditionContext has all information used for choosing proper Op +class OpConditionContext { + public: typedef std::unordered_map> TensorShapeMap; + OpConditionContext(const Workspace *ws, TensorShapeMap *info); + ~OpConditionContext() = default; - public: - explicit OpConstructContext(Workspace *ws); - OpConstructContext(Workspace *ws, TensorShapeMap *info); - ~OpConstructContext() = default; + void set_operator_def(const OperatorDef* operator_def); - void set_operator_def(std::shared_ptr operator_def); - - inline std::shared_ptr operator_def() const { + inline const OperatorDef *operator_def() const { return operator_def_; } - inline Workspace *workspace() const { + inline const Workspace *workspace() const { return ws_; } @@ -81,8 +79,8 @@ class OpConstructContext { #endif // MACE_ENABLE_OPENCL private: - std::shared_ptr operator_def_; - Workspace *ws_; + const OperatorDef *operator_def_; + const Workspace *ws_; Device *device_; TensorShapeMap *tensor_shape_info_; // used for memory transform @@ -94,6 +92,46 @@ class OpConstructContext { #endif // MACE_ENABLE_OPENCL }; +// memory_optimizer, device +class OpConstructContext { + typedef std::unordered_map> TensorShapeMap; + + public: + explicit OpConstructContext(Workspace *ws); + ~OpConstructContext() = default; + + void set_operator_def(std::shared_ptr operator_def); + + inline std::shared_ptr operator_def() const { + return operator_def_; + } + + inline Workspace *workspace() const { + return ws_; + } + + inline void set_device(Device* device) { + device_ = device; + } + + inline Device *device() const { + return device_; + } +#ifdef MACE_ENABLE_OPENCL + inline MemoryType GetOpMemoryType() const { + return static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, OutputMemoryTypeTagName(), + static_cast(MemoryType::CPU_BUFFER))); + } +#endif // MACE_ENABLE_OPENCL + + private: + std::shared_ptr operator_def_; + Workspace *ws_; + Device *device_; +}; + // memory_optimizer, device class OpInitContext { public: @@ -207,8 +245,11 @@ struct OpRegistrationInfo { public: typedef std::function(OpConstructContext *)> OpCreator; - typedef std::function(OpConstructContext *)> + typedef std::function(OpConditionContext *)> DevicePlacer; + typedef std::function MemoryTypeSetter; + typedef std::function(OpConditionContext *)> + DataFormatSelector; OpRegistrationInfo(); @@ -219,6 +260,8 @@ struct OpRegistrationInfo { std::set devices; std::unordered_map creators; DevicePlacer device_placer; + MemoryTypeSetter memory_type_setter; + DataFormatSelector data_format_selector; }; class OpConditionBuilder { @@ -230,11 +273,21 @@ class OpConditionBuilder { OpConditionBuilder &SetDevicePlacerFunc( OpRegistrationInfo::DevicePlacer placer); + // If you set input memory type for specified Op, + // you must call OpConditionContext::set_output_mem_type + OpConditionBuilder &SetInputMemoryTypeSetter( + OpRegistrationInfo::MemoryTypeSetter setter); + + OpConditionBuilder &SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector); + void Finalize(OpRegistrationInfo *info) const; private: std::string type_; OpRegistrationInfo::DevicePlacer placer_; + OpRegistrationInfo::MemoryTypeSetter memory_type_setter_; + OpRegistrationInfo::DataFormatSelector data_format_selector_; }; @@ -250,7 +303,13 @@ class OpRegistryBase { MaceStatus Register(const OpConditionBuilder &builder); const std::set AvailableDevices( - const std::string &op_type, OpConstructContext *context) const; + const std::string &op_type, OpConditionContext *context) const; + + void GetInOutMemoryTypes( + const std::string &op_type, OpConditionContext *context) const; + + const std::vector InputsDataFormat( + const std::string &op_type, OpConditionContext *context) const; std::unique_ptr CreateOperation( OpConstructContext *context, diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc index ca11414668d6e95f3d6bd70a13f48a312ea1c616..20ae6a2b5c279f1f2564011e98740fe56b83606b 100644 --- a/mace/core/runtime/opencl/opencl_util.cc +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -147,38 +147,38 @@ void OpenCLUtil::CalImage2DShape(const std::vector &shape, /* NHWC */ } } -std::shared_ptr OpenCLUtil::CreateTransformOpDef( +void OpenCLUtil::BuildTransformOpDef( const std::string &input_name, const std::vector &input_shape, const std::string &output_name, const mace::DataType dt, const OpenCLBufferType buffer_type, const mace::MemoryType mem_type, - bool has_data_format) { - std::unique_ptr op(new OperatorDef); + DataFormat data_format, + OperatorDef *op_def) { std::string op_name = "mace_node_" + output_name; - op->set_name(op_name); - op->set_type("BufferTransform"); - op->add_input(input_name); - op->add_output(output_name); - Argument *arg = op->add_arg(); + op_def->set_name(op_name); + op_def->set_type("BufferTransform"); + op_def->add_input(input_name); + op_def->add_output(output_name); + op_def->set_device_type(DeviceType::GPU); + Argument *arg = op_def->add_arg(); arg->set_name("buffer_type"); arg->set_i(static_cast(buffer_type)); - arg = op->add_arg(); + arg = op_def->add_arg(); arg->set_name("mem_type"); arg->set_i(static_cast(mem_type)); - arg = op->add_arg(); + arg = op_def->add_arg(); arg->set_name("T"); arg->set_i(static_cast(dt)); - arg = op->add_arg(); - arg->set_name("has_data_format"); - arg->set_i(has_data_format); + arg = op_def->add_arg(); + arg->set_name("data_format"); + arg->set_i(static_cast(data_format)); if (!input_shape.empty()) { - OutputShape *shape = op->add_output_shape(); + OutputShape *shape = op_def->add_output_shape(); for (auto value : input_shape) { shape->add_dims(value); } } - return std::move(op); } } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_util.h b/mace/core/runtime/opencl/opencl_util.h index ea0e239ee17c6826f23a73412ebc0a71d6dd25cf..2d5e2abf0d77b56d7305b6a64a187af39a3c1e0d 100644 --- a/mace/core/runtime/opencl/opencl_util.h +++ b/mace/core/runtime/opencl/opencl_util.h @@ -43,14 +43,15 @@ class OpenCLUtil { std::vector *image_shape, const int wino_blk_size = 2); - static std::shared_ptr CreateTransformOpDef( + static void BuildTransformOpDef( const std::string &input_name, const std::vector &input_shape, const std::string &output_name, const mace::DataType dt, const OpenCLBufferType buffer_type, const MemoryType mem_type, - bool has_data_format); + DataFormat data_format, + OperatorDef *op_def); }; } // namespace mace diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 7cb97fe77cb1a7f4ee6e2e1cf41aaa0d2062070e..f1740765eee32b43ae1af78011b9dbb5b8460c01 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -263,13 +263,13 @@ MaceStatus Workspace::PreallocateOutputTensor( } } VLOG(1) << "Preallocate buffer to tensors"; - bool is_quantize_model = IsQuantizedModel(net_def); for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { std::unique_ptr tensor (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id), tensor_mem.second.data_type, false, tensor_mem.first)); - if (tensor_mem.second.has_data_format) { + tensor->set_data_format(tensor_mem.second.data_format); + if (tensor_mem.second.data_format != DataFormat::NONE) { if (mem_blocks[tensor_mem.second.mem_id].mem_type() == MemoryType::GPU_IMAGE) { VLOG(1) << "Tensor: " << tensor_mem.first @@ -279,22 +279,12 @@ MaceStatus Workspace::PreallocateOutputTensor( << tensor->UnderlyingBuffer()->shape()[0] << ", " << tensor->UnderlyingBuffer()->shape()[1]; - tensor->set_data_format(DataFormat::NHWC); } else { VLOG(1) << "Tensor: " << tensor_mem.first << " Mem: " << tensor_mem.second.mem_id << " Data type: " << tensor->dtype() << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); - if (mem_blocks[tensor_mem.second.mem_id].mem_type() - == MemoryType::GPU_BUFFER || - is_quantize_model) { - tensor->set_data_format(DataFormat::NHWC); - } else { - tensor->set_data_format(DataFormat::NCHW); - } } - } else { - tensor->set_data_format(DataFormat::DF_NONE); } tensor_map_[tensor_mem.first] = std::move(tensor); } diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index bbb7c710ec2521f0946ca5d1978e622dc56220ac..054231e9b23bdb321ec36608f87bb7e665ffb651 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -94,7 +94,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } } diff --git a/mace/libmace/capability.cc b/mace/libmace/capability.cc index d37a62b6616b03bc476e7549b4e1b5d73357148d..46896fcd4335206f10f9a357aae5e52b98fe74ae 100644 --- a/mace/libmace/capability.cc +++ b/mace/libmace/capability.cc @@ -143,7 +143,7 @@ void BMNet::SetUp() { // Add input and output information for (size_t i = 0; i < input_names_.size(); ++i) { InputOutputInfo *info = net_.add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names_[i]); for (auto d : input_shapes_[i]) { info->add_dims(static_cast(d)); @@ -244,7 +244,7 @@ void BMNet::AddConv(const std::string &conv_type, op_def->add_output(output_name); AddIntsArg(op_def, "strides", strides); AddIntArg(op_def, "padding", padding_type); - AddIntArg(op_def, "has_data_format", 1); + AddIntArg(op_def, "data_format", static_cast(DataFormat::AUTO)); AddIntArg(op_def, "T", DT_HALF); if (has_relu6) { AddStringArg(op_def, "activation", "RELUX"); @@ -271,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name, op_def->add_output(output); AddIntArg(op_def, "type", type); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "has_data_format", 1); + AddIntArg(op_def, "data_format", static_cast(DataFormat::AUTO)); OutputShape *shape = op_def->add_output_shape(); for (auto dim : output_shape) { shape->add_dims(dim); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index fe6ea48818611aa8bfc1de1ae9f8063e2ac26944..8bad446ba69897410c95d5fb3d322c7975ab0f67 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -27,6 +27,7 @@ #include "mace/public/mace.h" #include "mace/port/env.h" #include "mace/port/file_system.h" +#include "mace/core/net_def_adapter.h" #ifdef MACE_ENABLE_OPENCL #include "mace/core/runtime/opencl/gpu_device.h" @@ -282,9 +283,9 @@ MaceTensor::MaceTensor(const std::vector &shape, std::shared_ptr data, const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); - MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC - || format == DataFormat::NCHW || format == OIHW, - "MACE only support DF_NONE, NHWC, NCHW and OIHW " + MACE_CHECK(format == DataFormat::NONE || format == DataFormat::NHWC + || format == DataFormat::NCHW || format == DataFormat::OIHW, + "MACE only support NONE, NHWC, NCHW and OIHW " "formats of input now."); impl_ = make_unique(); impl_->shape = shape; @@ -495,7 +496,7 @@ MaceStatus MaceEngine::Impl::Init( DataType output_dt = output_info_map_[output_name].data_type(); Tensor *output_tensor = ws_->CreateTensor(output_name, device_->allocator(), output_dt); - output_tensor->set_data_format(NHWC); + output_tensor->set_data_format(DataFormat::NHWC); #endif } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) @@ -512,26 +513,32 @@ MaceStatus MaceEngine::Impl::Init( } } else { #endif - MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, - device_.get(), - model_data)); - - MemoryOptimizer mem_optimizer; - // Init model - net_ = std::unique_ptr(new SerialNet(op_registry_.get(), - net_def, - ws_.get(), - device_.get(), - &mem_optimizer)); - - // Preallocate all output tensors of ops - MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, - &mem_optimizer, - device_.get())); - if (device_type_ == DeviceType::GPU) { - ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); - } - MACE_RETURN_IF_ERROR(net_->Init()); + MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def, + device_.get(), + model_data)); + + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), ws_.get()); + net_def_adapter.AdaptNetDef(net_def, device_.get(), &adapted_net_def); + + MemoryOptimizer mem_optimizer; + // Init model + net_ = std::unique_ptr(new SerialNet(op_registry_.get(), + &adapted_net_def, + ws_.get(), + device_.get(), + &mem_optimizer)); + + // Preallocate all output tensors of ops + MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(adapted_net_def, + &mem_optimizer, + device_.get())); + if (device_type_ == DeviceType::GPU) { + ws_->RemoveAndReloadBuffer(adapted_net_def, + model_data, + device_->allocator()); + } + MACE_RETURN_IF_ERROR(net_->Init()); #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) } #endif @@ -578,14 +585,14 @@ MaceEngine::Impl::~Impl() { MaceStatus MaceEngine::Impl::TransposeInput( const std::pair &input, Tensor *input_tensor) { - bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE; - DataFormat data_format = DataFormat::DF_NONE; + bool has_data_format = input_tensor->data_format() != DataFormat::NONE; + DataFormat data_format = DataFormat::NONE; DataType input_dt = input_tensor->dtype(); if (has_data_format) { std::vector dst_dims; if (device_->device_type() == DeviceType::CPU && input.second.shape().size() == 4 && - input.second.data_format() == NHWC && + input.second.data_format() == DataFormat::NHWC && !is_quantized_model_) { VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; input_tensor->set_data_format(DataFormat::NCHW); @@ -647,28 +654,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput( DataType output_dt = output_tensor->dtype(); // save output if (output_tensor != nullptr && output->second.data() != nullptr) { - if (output_tensor->data_format() != DataFormat::DF_NONE && - output->second.data_format() != DataFormat::DF_NONE && + if (output_tensor->data_format() != DataFormat::NONE && + output->second.data_format() != DataFormat::NONE && output->second.shape().size() == 4 && output->second.data_format() != output_tensor->data_format()) { VLOG(1) << "Transform output " << output->first << " from " - << output_tensor->data_format() << " to " - << output->second.data_format(); + << static_cast(output_tensor->data_format()) << " to " + << static_cast(output->second.data_format()); std::vector dst_dims; - if (output_tensor->data_format() == NCHW && - output->second.data_format() == NHWC) { + if (output_tensor->data_format() == DataFormat::NCHW && + output->second.data_format() == DataFormat::NHWC) { dst_dims = {0, 2, 3, 1}; - } else if (output_tensor->data_format() == NHWC && - output->second.data_format() == NCHW) { + } else if (output_tensor->data_format() == DataFormat::NHWC && + output->second.data_format() == DataFormat::NCHW) { dst_dims = {0, 3, 1, 2}; } else { LOG(FATAL) << "Not supported output data format: " - << output->second.data_format() << " vs " - << output_tensor->data_format(); + << static_cast(output->second.data_format()) << " vs " + << static_cast(output_tensor->data_format()); } VLOG(1) << "Transform output " << output->first << " from " - << output_tensor->data_format() << " to " - << output->second.data_format(); + << static_cast(output_tensor->data_format()) << " to " + << static_cast(output->second.data_format()); std::vector shape = TransposeShape(output_tensor->shape(), dst_dims); diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index bcdcd8e062b21c91b3a44bf8dd999237a385f3c6..6cb21b5c525ee0b6529348bcfcddd7acd9cfef7b 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -15,6 +15,8 @@ #include "mace/ops/activation.h" #include +#include + #include "mace/core/operator.h" #if defined(MACE_ENABLE_NEON) @@ -94,7 +96,7 @@ class ActivationOp : public Operation { auto leakyrelu_coefficient = static_cast( Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( type, relux_max_limit, leakyrelu_coefficient); @@ -132,6 +134,24 @@ void RegisterActivation(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("Activation") + .SetDevicePlacerFunc( + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); + if (!has_data_format || + op->output_shape(0).dims_size() != 4) { + return { DeviceType::CPU }; + } + return { DeviceType::CPU, DeviceType::GPU }; + })); } } // namespace ops diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f16cf0604f77a1a4c2f9db90e9633e088a9a74d8..c2c9588226e91b4de6e237bf5785a18c8d1798c7 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -207,7 +207,8 @@ void TestSimplePrelu() { // Run net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Activation", "PreluTest") .Input("InputNCHW") .Input("Alpha") @@ -217,7 +218,8 @@ void TestSimplePrelu() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor( diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 1f0fa7a1fcec392d35fc36c6438adda32d2e9af7..523557cffdec564ba9706c4279dd4f20f0d933a7 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -67,7 +67,7 @@ class AddNOp : public Operation { public: explicit AddNOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -101,6 +101,24 @@ void RegisterAddN(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("AddN") + .SetDevicePlacerFunc( + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); + if (!has_data_format || + op->output_shape(0).dims_size() != 4) { + return { DeviceType::CPU }; + } + return { DeviceType::CPU, DeviceType::GPU }; + })); } } // namespace ops diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/fp32/deconv_2d.cc index a80d6d645b15720a4210de9c9cdab3fc9c8401b9..41a01a6ca3c653e3412c6c1f27403c0d4c04bd11 100644 --- a/mace/ops/arm/fp32/deconv_2d.cc +++ b/mace/ops/arm/fp32/deconv_2d.cc @@ -54,7 +54,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut( out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index c6559032973cdc580aa34b6fe53aaae5f8d585b3..4e303d07e79b1a5cc9d847720aede92de462f980 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -174,7 +174,7 @@ class BatchNormOp : public Operation { float leakyrelu_coefficient = Operation::GetOptionalArg( "leakyrelu_coefficient", 0.0f); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( epsilon, activation, relux_max_limit, leakyrelu_coefficient); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 495a2409a65f652373ac62c2d3150d524335103b..83c8219f9e788d24d268f89a3c0f9ff7288bcaf4 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -34,7 +34,8 @@ void Simple() { net.AddInputFromArray("Var", {1}, {11.67f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") .Input("Scale") @@ -47,7 +48,8 @@ void Simple() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BatchNorm", "BatchNormTest") .Input("Input") @@ -93,8 +95,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BatchNorm", "BatchNormTest") @@ -112,8 +114,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -163,8 +165,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -179,8 +181,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -230,8 +232,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -246,8 +248,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -296,8 +298,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -312,8 +314,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index c44501f12e73a92c942d987ac1e51a0fbd1648c9..03ac91ffb146d4e54c12d94497fb19bdec23337a 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -264,7 +264,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) : BatchToSpaceOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 9351de79518ee71671f7595f39f2c410a7e7b265..72e93fece0850710fd26aefab0cdddcddaedfc3e 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -103,7 +103,7 @@ class BiasAddOp : public Operation { : Operation(context), has_data_format_(Operation::GetOptionalArg("has_data_format", 1)) { MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { @@ -145,6 +145,24 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("BiasAdd") + .SetDevicePlacerFunc( + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); + if (!has_data_format || + op->output_shape(0).dims_size() != 4) { + return { DeviceType::CPU }; + } + return { DeviceType::CPU, DeviceType::GPU }; + })); } } // namespace ops diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 34f6a713b3429fbf9da955b20df917f4a0b8bc32..8c51b70361ea02ecdbc7ae8ba8dc00727ea16dd8 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -27,9 +27,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { OpsTestNet net; // Add input data - DataFormat data_format = NHWC; if (D == DeviceType::CPU) { - data_format = NCHW; net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 2e4764cac8ad2cf1f303a2e53c64fda444023fa3..0126abb9d20645c51925e218bdc881fc3801fd5b 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -31,8 +31,8 @@ void BiasAddSimple() { net.AddInputFromArray("Bias", {1}, {0.5f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") @@ -41,8 +41,8 @@ void BiasAddSimple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("Input") @@ -83,8 +83,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { {batch, height, width, channels}); net.AddRandomInput("Bias", {channels}, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BiasAdd", "BiasAddTest") @@ -97,8 +97,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -132,8 +132,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { {batch, height, width, channels}); net.AddRandomInput("Bias", {channels}, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BiasAdd", "BiasAddTest") @@ -146,8 +146,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 92733d61b0f028074604b7840202507768b70e38..2a8c42b3a142e723efb8ed6014bab9f486f5e9eb 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -48,7 +48,6 @@ void FilterBufferToImage(int iters, OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE, 0, - DataFormat::NHWC, b2i_output); }; diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index a819b6a703859b2c111f23b3971eddd36a670be4..cb52eafe19bf27f926c36653889942a232edb2c5 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -37,14 +37,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -178,14 +178,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DT_FLOAT); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -218,14 +218,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, // Transform OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index 229d4eb9657432f7966368da759cb0b497972ee9..7e59b339642b571b7bc08f09af1b07814096eaf0 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -39,14 +39,11 @@ class BufferTransformOp : public Operation { auto type = static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(CONV2D_FILTER))); - bool has_data_format = Operation::GetOptionalArg("has_data_format", 0) - != 0; MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( - context, input, type, out_mem_type_, wino_blk_size_, - has_data_format, output); + context, input, type, out_mem_type_, wino_blk_size_, output); } private: diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc index b3f68a31ae854726e56b93f626c3bcb4ba24dac3..a9af4bc9943fceb62d61e9ec7b13a58188230e83 100644 --- a/mace/ops/buffer_transform_test.cc +++ b/mace/ops/buffer_transform_test.cc @@ -48,7 +48,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_BUFFER) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, bt_output); + type, MemoryType::GPU_BUFFER, 0, bt_output); // Inverse Transform Tensor *output = net.ws()->CreateTensor( @@ -57,7 +57,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_BUFFER) .Transform(&context, bt_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, output); + type, MemoryType::GPU_BUFFER, 0, output); if (DataTypeToEnum::value == DataTypeToEnum::value) { EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), @@ -94,7 +94,7 @@ void TestArgumentTransform(const index_t input_size) { MemoryType::GPU_BUFFER) .Transform(&context, net.ws()->GetTensor("Input"), OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, - 0, DataFormat::NHWC, output); + 0, output); index_t expected_size = RoundUp(input_size, 4); EXPECT_EQ(expected_size, output->buffer_shape()[0]); diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 966b5d57347b9405d3d43d9c113b00de3d38ce3e..d68ebbbec9d8c03ee4045c92cf4258f9326dcca8 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -82,7 +82,7 @@ class ChannelShuffleOp : public Operation { explicit ChannelShuffleOp(OpConstructContext *context) : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(groups); } else { MACE_NOT_IMPLEMENTED; @@ -116,7 +116,7 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { op_registry, OpConditionBuilder("ChannelShuffle") .SetDevicePlacerFunc( - [](OpConstructContext *context) -> std::set { + [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { return { DeviceType::CPU, DeviceType::GPU }; diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index d59b45d8fdf7a5827f5f5b18e64d823a9166f108..4e25448bc91b472fc239747aceb1f1a57ec07348 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -28,8 +28,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { "Input", {1, 1, 2, 8}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") @@ -40,8 +40,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor( diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index 2ca95a7d75986c03c81d80f9ce0365d53df7005b..4398888174675cb202cccefcf4cb374b97925aca 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -40,19 +40,19 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, index_t input_height = 0, input_width = 0; index_t kernel_height = 0, kernel_width = 0; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { input_height = input_shape[2]; input_width = input_shape[3]; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { input_height = input_shape[1]; input_width = input_shape[2]; } else { MACE_NOT_IMPLEMENTED; } - if (filter_format == OIHW) { + if (filter_format == DataFormat::OIHW) { kernel_height = filter_shape[2]; kernel_width = filter_shape[3]; - } else if (filter_format == OHWI) { + } else if (filter_format == DataFormat::OHWI) { kernel_height = filter_shape[1]; kernel_width = filter_shape[2]; } else { @@ -97,11 +97,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, 0, (output_width - 1) * strides[1] + k_extent_width - input_width); output_shape[0] = input_shape[0]; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { output_shape[1] = output_channels; output_shape[2] = output_height; output_shape[3] = output_width; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { output_shape[1] = output_height; output_shape[2] = output_width; output_shape[3] = output_channels; @@ -117,7 +117,8 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW Padding padding, index_t *output_shape, int *padding_size) { - CalcPaddingAndOutputSize(input_shape, NCHW, filter_shape, OIHW, dilations, + CalcPaddingAndOutputSize(input_shape, DataFormat::NCHW, filter_shape, + DataFormat::OIHW, dilations, strides, padding, output_shape, padding_size); } @@ -128,7 +129,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC Padding padding, index_t *output_shape, int *padding_size) { - CalcPaddingAndOutputSize(input_shape, NHWC, filter_shape, OIHW, dilations, + CalcPaddingAndOutputSize(input_shape, DataFormat::NHWC, filter_shape, + DataFormat::OIHW, dilations, strides, padding, output_shape, padding_size); } @@ -151,19 +153,19 @@ void CalcOutputSize(const index_t *input_shape, index_t input_height = 0, input_width = 0; index_t kernel_height = 0, kernel_width = 0; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { input_height = input_shape[2]; input_width = input_shape[3]; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { input_height = input_shape[1]; input_width = input_shape[2]; } else { MACE_NOT_IMPLEMENTED; } - if (filter_format == OIHW) { + if (filter_format == DataFormat::OIHW) { kernel_height = filter_shape[2]; kernel_width = filter_shape[3]; - } else if (filter_format == OHWI) { + } else if (filter_format == DataFormat::OHWI) { kernel_height = filter_shape[1]; kernel_width = filter_shape[2]; } else { @@ -195,11 +197,11 @@ void CalcOutputSize(const index_t *input_shape, } output_shape[0] = input_shape[0]; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { output_shape[1] = output_channels; output_shape[2] = output_height; output_shape[3] = output_width; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { output_shape[1] = output_height; output_shape[2] = output_width; output_shape[3] = output_channels; @@ -215,7 +217,8 @@ void CalcOutputSize(const index_t *input_shape, // NHWC const int *strides, const RoundType round_type, index_t *output_shape) { - CalcOutputSize(input_shape, NHWC, filter_shape, OIHW, padding_size, dilations, + CalcOutputSize(input_shape, DataFormat::NHWC, filter_shape, + DataFormat::OIHW, padding_size, dilations, strides, round_type, output_shape); } @@ -226,7 +229,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW const int *strides, const RoundType round_type, index_t *output_shape) { - CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations, + CalcOutputSize(input_shape, DataFormat::NCHW, filter_shape, + DataFormat::OIHW, padding_size, dilations, strides, round_type, output_shape); } @@ -241,14 +245,18 @@ void CalcDeconvShape_TF(const std::vector &input_shape, std::vector *padded_out_shape, DataFormat data_format) { const index_t - in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + in_height = + data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1]; const index_t - in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + in_width = + data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2]; const index_t - out_height = data_format == NCHW ? output_shape[2] : output_shape[1]; + out_height = + data_format == DataFormat::NCHW ? output_shape[2] : output_shape[1]; const index_t - out_width = data_format == NCHW ? output_shape[3] : output_shape[2]; + out_width = + data_format == DataFormat::NCHW ? output_shape[3] : output_shape[2]; const index_t extended_in_height = (in_height - 1) * strides[0] + 1; const index_t extended_in_width = (in_width - 1) * strides[1] + 1; @@ -307,11 +315,11 @@ void CalcDeconvShape_TF(const std::vector &input_shape, padded_out_shape->resize(4); (*padded_out_shape)[0] = output_shape[0]; (*padded_out_shape)[1] = - data_format == NCHW ? output_channel : padded_out_height; + data_format == DataFormat::NCHW ? output_channel : padded_out_height; (*padded_out_shape)[2] = - data_format == NCHW ? padded_out_height : padded_out_width; + data_format == DataFormat::NCHW ? padded_out_height : padded_out_width; (*padded_out_shape)[3] = - data_format == NCHW ? padded_out_width : output_channel; + data_format == DataFormat::NCHW ? padded_out_width : output_channel; } } @@ -325,9 +333,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, std::vector *padded_out_shape, DataFormat data_format) { const index_t - in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + in_height = + data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1]; const index_t - in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + in_width = + data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2]; const index_t output_channel = filter_shape[0] * group; @@ -351,11 +361,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, padded_out_shape->resize(4); (*padded_out_shape)[0] = input_shape[0]; (*padded_out_shape)[1] = - data_format == NCHW ? output_channel : padded_out_height; + data_format == DataFormat::NCHW ? output_channel : padded_out_height; (*padded_out_shape)[2] = - data_format == NCHW ? padded_out_height : padded_out_width; + data_format == DataFormat::NCHW ? padded_out_height : padded_out_width; (*padded_out_shape)[3] = - data_format == NCHW ? padded_out_width : output_channel; + data_format == DataFormat::NCHW ? padded_out_width : output_channel; } if (out_shape != nullptr) { @@ -363,9 +373,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, index_t out_width = padded_out_width - out_pad_size[1]; out_shape->resize(4); (*out_shape)[0] = input_shape[0]; - (*out_shape)[1] = data_format == NCHW ? output_channel : out_height; - (*out_shape)[2] = data_format == NCHW ? out_height : out_width; - (*out_shape)[3] = data_format == NCHW ? out_width : output_channel; + (*out_shape)[1] = + data_format == DataFormat::NCHW ? output_channel : out_height; + (*out_shape)[2] = data_format == DataFormat::NCHW ? out_height : out_width; + (*out_shape)[3] = + data_format == DataFormat::NCHW ? out_width : output_channel; } } @@ -385,7 +397,7 @@ void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, MACE_CHECK(output_shape->size() == 4, "deconv output shape shoud be 4-dims"); std::vector &out_shape = *output_shape; - if (data_format == NCHW) { + if (data_format == DataFormat::NCHW) { const index_t t = out_shape[1]; out_shape[1] = out_shape[3]; out_shape[3] = out_shape[2]; diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index 9fa45feb69e2ac9c7a5feb65f5f87dce44a82e2e..518e9cc2b5b9b0d8ff54308e60bc5a3c55e52f42 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -199,7 +199,7 @@ class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -241,12 +241,12 @@ void RegisterConcat(OpRegistryBase *op_registry) { op_registry, OpConditionBuilder("Concat") .SetDevicePlacerFunc( - [](OpConstructContext *context) -> std::set { + [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); - auto tensor_shape_info = context->tensor_shape_info(); if (op->output_shape_size() != op->output_size()) { return { DeviceType::CPU, DeviceType::GPU }; } + auto tensor_shape_info = context->tensor_shape_info(); if (op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } else { diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 5fefeddcd1c523c0da1c3f1c384119f4865b361e..cc84b9632df9d4b6013d08d2381677bb38bd7d47 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -231,9 +231,9 @@ class Conv2dOp : public ConvPool2dOpBase { std::vector paddings(2); if (paddings_.empty()) { CalcPaddingAndOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, filter->shape().data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -242,9 +242,9 @@ class Conv2dOp : public ConvPool2dOpBase { } else { paddings = paddings_; CalcOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, filter->shape().data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -459,14 +459,13 @@ class Conv2dOp : public ConvPool2dOpBase { "leakyrelu_coefficient", 0.0f)), wino_block_size_(Operation::GetOptionalArg("wino_block_size", 0)) { MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { mem_type = MemoryType::GPU_BUFFER; kernel_ = make_unique>(); } - context->set_output_mem_type(mem_type); // Transform filter tensor to target format if ((wino_block_size_ == 2 || wino_block_size_ == 4) && (kernel_->CheckUseWinograd( diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 7fb854787c032a5106c065d92830729d8243e9a1..42929057cb12b9515993f33ac62dfbbb0790d658 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -47,8 +47,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -60,8 +60,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -105,8 +105,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { const std::vector output_shape = {1, 3, 3, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -118,8 +118,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -189,8 +189,8 @@ void TestNHWCSimple3x3WithoutBias() { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -203,8 +203,8 @@ void TestNHWCSimple3x3WithoutBias() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -256,8 +256,8 @@ void TestNHWCCombined3x3() { net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -270,8 +270,8 @@ void TestNHWCCombined3x3() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -321,8 +321,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -336,8 +336,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -376,8 +376,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -391,8 +391,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -459,8 +459,8 @@ void TestConv1x1() { net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -472,8 +472,8 @@ void TestConv1x1() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -532,8 +532,8 @@ void TestComplexConvNxNS12(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true, false); net.AddRandomInput("Bias", {output_channels}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") @@ -552,8 +552,8 @@ void TestComplexConvNxNS12(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -651,8 +651,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, float_bias_data, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -667,8 +667,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -811,8 +811,8 @@ void TestDilationConvNxN(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") @@ -828,8 +828,8 @@ void TestDilationConvNxN(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -900,8 +900,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -916,8 +916,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -979,8 +979,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -994,8 +994,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -1118,12 +1118,12 @@ void TestQuant(const index_t batch, net.AddRandomInput("Filter", {out_channels, k_height, k_width, in_channels}, true); net.AddRandomInput("Bias", {out_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.TransformFilterDataFormat("Filter", - OHWI, + DataFormat::OHWI, "FilterOIHW", - OIHW); + DataFormat::OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -1136,8 +1136,8 @@ void TestQuant(const index_t batch, .AddIntArg("T", static_cast(DT_FLOAT)) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeFilter") .Input("Filter") diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 7265208efdd3d62d682c1689b82049ce2dd42e07..20146c8d05eb728ae54711af0883da5cf6e38bca 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -117,7 +117,7 @@ class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( Operation::GetRepeatedArgs("offset")); } else { @@ -145,6 +145,24 @@ void RegisterCrop(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Crop", CropOp, DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("Crop") + .SetDevicePlacerFunc( + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); + if (!has_data_format || + op->output_shape(0).dims_size() != 4) { + return { DeviceType::CPU }; + } + return { DeviceType::CPU, DeviceType::GPU }; + })); } } // namespace ops diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index 213b8ce89a58b5745c4e5685c6a825442b5826ce..0fd0026b2ff3ba350d30c7daebab236d43033f0d 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -42,13 +42,13 @@ void RunCrop(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); } else if (D == CPU) { net.TransformDataFormat("Input0", - NHWC, + DataFormat::NHWC, "InputNCHW0", - NCHW); + DataFormat::NCHW); net.TransformDataFormat("Input1", - NHWC, + DataFormat::NHWC, "InputNCHW1", - NCHW); + DataFormat::NCHW); OpDefBuilder("Crop", "CropTest") .Input("InputNCHW0") .Input("InputNCHW1") @@ -62,8 +62,8 @@ void RunCrop(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check auto expected = net.CreateTensor(expected_shape, expected_data); diff --git a/mace/ops/cumsum_test.cc b/mace/ops/cumsum_test.cc index 8b111540c9040a391ae419d86e3c042b23954b5e..69e629653b79fd66c409a55f3ed5438fc0826b67 100644 --- a/mace/ops/cumsum_test.cc +++ b/mace/ops/cumsum_test.cc @@ -32,8 +32,8 @@ void SimpleTestWithDataFormat(const std::vector &shape, OpsTestNet net; net.AddInputFromArray("Input", shape, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Cumsum", "CumsumTest") .Input("InputNCHW") @@ -48,8 +48,8 @@ void SimpleTestWithDataFormat(const std::vector &shape, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 5692425ad10ba05f92fdf06c428106bdf15455a9..2b7623e6d48cf5738bccbbed6c7cf30820342f19 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -173,7 +173,7 @@ class Deconv2dOp : public Deconv2dOpBase { explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -197,7 +197,6 @@ class Deconv2dOp : public Deconv2dOpBase { OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } - context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32); } } MaceStatus Run(OpContext *context) override { @@ -241,7 +240,7 @@ class Deconv2dOp : public Deconv2dOpBase { &out_paddings, nullptr, model_type_, - NHWC); + DataFormat::NHWC); return kernel_->Compute(context, input, filter, bias, strides_.data(), in_paddings.data(), activation_, @@ -264,6 +263,30 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, DeviceType::GPU, half); + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("Deconv2D") + .SetInputMemoryTypeSetter( + [](OpConditionContext *context) -> void { + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (context->device()->device_type() == DeviceType::GPU) { + if (context->device()->gpu_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; + } else { + MACE_NOT_IMPLEMENTED; + } + FrameworkType framework_type = + static_cast( + ProtoArgHelper::GetOptionalArg( + *(context->operator_def()), "framework_type", + FrameworkType::TENSORFLOW)); + if (framework_type == FrameworkType::TENSORFLOW) { + context->SetInputInfo(2, MemoryType::CPU_BUFFER, + DataType::DT_INT32); + } + } + context->set_output_mem_type(mem_type); + })); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 25aa7eeeeed80e6403c125ec101a95c536eebe2c..9ea8161ef47de3e40e4f1260e00ead158e48d740 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -47,7 +47,8 @@ void RunTestSimple(const std::vector &input_shape, net.AddInputFromArray("Filter", filter_shape, filter_data, true); net.AddInputFromArray("Bias", {out_channels}, bias_data, true); // TODO(liutuo): remove the unused transform - net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.TransformFilterDataFormat( + "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW); if (D == DeviceType::GPU) { if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") @@ -77,8 +78,8 @@ void RunTestSimple(const std::vector &input_shape, } net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") @@ -109,8 +110,8 @@ void RunTestSimple(const std::vector &input_shape, // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(expected_shape, expected_data); @@ -380,8 +381,8 @@ void TestComplexDeconvNxN(const int batch, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true, false); net.AddRandomInput("Bias", {output_channels}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); int out_h = 0; int out_w = 0; @@ -440,8 +441,8 @@ void TestComplexDeconvNxN(const int batch, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 09208e7abf1194455450cb038343b0e79c65891f..a57ddecfae2ddbcc78b93d601382c3a2933fafac 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -96,7 +96,7 @@ class DepthToSpaceOp : public Operation { explicit DepthToSpaceOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 2719619fe4a858a3ff61df3c85d4d58708ea88ac..65fb7d39e3f3ace225db18969648e64959a71455 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -32,8 +32,8 @@ void RunDepthToSpace(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); // Construct graph if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -41,8 +41,8 @@ void RunDepthToSpace(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("DepthToSpace", "DepthToSpaceTest") @@ -114,8 +114,8 @@ void RandomTest(const int block_size, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) @@ -125,8 +125,8 @@ void RandomTest(const int block_size, // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("Input") diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 67339ef9c5a78ef37369e4b6c197781dea5690db..ae2a4dfda760e2fe9d182a510fc353ef2d73c363 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -188,9 +188,9 @@ class DepthwiseConv2dOp filter->dim(2) * filter->dim(3), filter->dim(0), filter->dim(1), 1}; if (paddings_.empty()) { CalcPaddingAndOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, ohwi_shape.data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -199,9 +199,9 @@ class DepthwiseConv2dOp } else { paddings = paddings_; CalcOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, ohwi_shape.data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -375,14 +375,13 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { mem_type = MemoryType::GPU_BUFFER; kernel_ = make_unique>(); } - context->set_output_mem_type(mem_type); Tensor *filter_tensor = context->workspace()->GetTensor( operator_def_->input(1)); if (filter_tensor != nullptr && filter_tensor->is_weight()) { @@ -393,8 +392,6 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { 1, OpenCLBufferType::DW_CONV2D_FILTER, mem_type) == MaceStatus::MACE_SUCCESS); - } else { - context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER); } if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( @@ -440,7 +437,40 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", DepthwiseConv2dOp, DeviceType::GPU, half); + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("DepthwiseConv2d") + .SetInputMemoryTypeSetter( + [](OpConditionContext *context) -> void { + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (context->device()->device_type() == DeviceType::GPU) { + if (context->device()->gpu_runtime()->UseImageMemory()) { + mem_type = MemoryType::GPU_IMAGE; + } else { + mem_type = MemoryType::GPU_BUFFER; + } + auto filter_tensor = context->workspace()->GetTensor( + context->operator_def()->input(1)); + if (filter_tensor == nullptr || !filter_tensor->is_weight()) { + context->SetInputOpenCLBufferType( + 1, OpenCLBufferType::DW_CONV2D_FILTER); + } + } + context->set_output_mem_type(mem_type); + })); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("DepthwiseConv2d") + .SetInputsDataFormatSelector( + [](OpConditionContext *context) -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; + })); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 58852a012e84fb6664331708738adcd180519e5d..d34722a5bc02025ccbafe285fcc7f2bb8759db7f 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -39,8 +39,8 @@ void SimpleValidTest() { true); net.AddInputFromArray("Bias", {2}, {.1f, .2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -52,8 +52,8 @@ void SimpleValidTest() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") @@ -127,8 +127,8 @@ void ComplexValidTest(index_t batch, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -141,8 +141,8 @@ void ComplexValidTest(index_t batch, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") @@ -249,8 +249,8 @@ void TestNxNS12(const index_t height, const index_t width) { {multiplier * channel}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -267,8 +267,8 @@ void TestNxNS12(const index_t height, const index_t width) { // Run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -389,9 +389,9 @@ void TestQuant(const index_t batch, "Filter", {k_height, k_width, in_channels, multiplier}, true, false); net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.TransformFilterDataFormat( - "Filter", HWIO, "FilterOIHW", OIHW); + "Filter", DataFormat::HWIO, "FilterOIHW", DataFormat::OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") @@ -405,7 +405,7 @@ void TestQuant(const index_t batch, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeFilter") .Input("Filter") diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 6111ea3062b241514fccca9167410f6314e4fcaf..31b634af11ed9756fbb14eddd91d519a7224d1d6 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -190,7 +190,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -230,7 +230,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { &out_paddings, nullptr, CAFFE, - NHWC); + DataFormat::NHWC); return kernel_->Compute(context, input, diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index 0cf3de95bf5c2d077e062dcde07a232977ff8ba6..fda0cf59b8d7182c896ee55b6290e1af02211ca3 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -39,7 +39,8 @@ void RunTestSimple(const int group, // Add input data net.AddInputFromArray("Input", input_shape, input_data); net.AddInputFromArray("Filter", filter_shape, filter_data, true); - net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.TransformFilterDataFormat( + "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW); const index_t out_channels = expected_shape[3]; net.AddInputFromArray("Bias", {out_channels}, bias_data, true); @@ -56,8 +57,8 @@ void RunTestSimple(const int group, net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, - "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .Input("InputNCHW") .Input("FilterOIHW") @@ -69,8 +70,8 @@ void RunTestSimple(const int group, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(expected_shape, expected_data); @@ -193,8 +194,8 @@ void RandomTest(index_t batch, {channel * multiplier}, bias_data, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .Input("InputNCHW") .Input("Filter") @@ -210,8 +211,8 @@ void RandomTest(index_t batch, .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 04c0e10e323a53d9e3efb042366c4ff6cc1b666d..bfe0074289363169ab41af72db5489b343ff2c84 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1145,7 +1145,7 @@ class EltwiseOp : public Operation { int32_t scalar_input_index = Operation::GetOptionalArg( "scalar_input_index", 1); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( type, coeff, scalar_input, scalar_input_index); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 58306b625a5ce8e38b0b129c230a4401d3a06ae9..08dc11d00346abe50baca029352bd367ca9b6c91 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -69,7 +69,8 @@ void SimpleTensorScalar(const ops::EltwiseType type, net.AddInputFromArray("Input", shape, input); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "TInput", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput") .AddIntArg("T", DataTypeToEnum::v()) @@ -81,7 +82,8 @@ void SimpleTensorScalar(const ops::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input") @@ -124,13 +126,15 @@ void SimpleTensorEltwise(const ops::EltwiseType type, .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput"); if (shape0.size() > 1) { - net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); op_builder.Input("TInput0"); } else { op_builder.Input("Input0"); } if (shape1.size() > 1) { - net.TransformDataFormat("Input1", NHWC, "TInput1", NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); op_builder.Input("TInput1"); } else { op_builder.Input("Input1"); @@ -139,7 +143,8 @@ void SimpleTensorEltwise(const ops::EltwiseType type, // Run net.RunOp(D); - net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input0") @@ -560,7 +565,8 @@ void GPUOverflowTest(const ops::EltwiseType type, net.AddInputFromArray( "Filter", {output_shape.back(), shape0.back(), 3, 3}, - std::vector(output_shape.back() * shape0.back() * 9, 1)); + std::vector(output_shape.back() * shape0.back() * 9, 1), + true); OpDefBuilder("Conv2D", "Conv2D") .AddIntArg("T", DataTypeToEnum::v()) .Input("EltOutput") @@ -636,8 +642,8 @@ void RandomTensorScalar(const ops::EltwiseType type, // Add input data net.AddRandomInput("Input", shape, false, true, true); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput") .AddIntArg("type", static_cast(type)) @@ -647,8 +653,8 @@ void RandomTensorScalar(const ops::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -690,10 +696,10 @@ void RandomTensorEltwise(const ops::EltwiseType type, true, true); - net.TransformDataFormat("Input0", NHWC, "TInput0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "TInput1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") @@ -705,8 +711,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -746,10 +752,10 @@ void Quantized(const std::vector &shape, true, true); - net.TransformDataFormat("Input0", NHWC, "TInput0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "TInput1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") @@ -761,8 +767,8 @@ void Quantized(const std::vector &shape, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput0") .Input("Input0") diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc index 78fed15619553b3903d8c71015b4d4228f6a5c7a..5474dd4bc26f50836271a2073be7e5f28f1f0ffe 100644 --- a/mace/ops/expand_dims.cc +++ b/mace/ops/expand_dims.cc @@ -14,7 +14,6 @@ #include "mace/core/operator.h" -#include "mace/ops/common/transpose.h" #include "mace/utils/math.h" namespace mace { @@ -44,27 +43,8 @@ class ExpandDimsOp : public Operation { std::vector output_shape(input_shape); output_shape.insert(output_shape.begin() + axis_, 1); - bool has_data_format = Operation::GetOptionalArg( - "has_data_format", 0) == 1; - if (has_data_format && output_shape.size() == 4) { - // only tensorflow support expand dim, so the default format is NHWC - // transform NHWC to NCHW - auto t_output_shape = TransposeShape(output_shape, - {0, 3, 1, 2}); - output->Resize(t_output_shape); - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - auto input_data = input->data(); - auto output_data = output->mutable_data(); - - Transpose(&context->device()->cpu_runtime()->thread_pool(), - input_data, output_shape, {0, 3, 1, 2}, output_data); - } else { - output->Resize(output_shape); - Tensor::MappingGuard input_guard(input); - auto input_data = input->data(); - output->Copy(input_data, input->size()); - } + output->ReuseTensorBuffer(*input); + output->Reshape(output_shape); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 5be44e05dc2140e8a7386591fe8df18a4426283b..fb0c45bb19f6aa51eeb17e5c6e6697ce96390bbe 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -49,7 +49,8 @@ void Simple() { net.AddInputFromArray("Offset", {1}, offset, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") @@ -58,7 +59,8 @@ void Simple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("Input") @@ -100,8 +102,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -113,8 +115,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -151,8 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -164,8 +166,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -205,8 +207,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -218,8 +220,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -254,11 +256,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -270,8 +272,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 64765d9c99f6a9ade2b8ef7a1a2cdd5874f3c243..9a371b16566c714cc8c352bc7b6a4b1382a9695e 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -190,7 +190,7 @@ class FullyConnectedOp : public FullyConnectedOpBase { explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 64fead6e05bc4a1d552d20e55a8645b589751968..586eb166459dc2267a204a8cbdd0652252d5c345 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -48,7 +48,8 @@ void Simple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") @@ -129,8 +130,8 @@ void Random(const index_t batch, net.AddRandomInput("Bias", {out_channel}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") .Input("Weight") @@ -143,7 +144,8 @@ void Random(const index_t batch, // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -215,8 +217,10 @@ void QuantRandom(const index_t batch, net.AddRandomInput( "Weight", {out_channel, height, width, channels}, true); net.AddRandomInput("Bias", {out_channel}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformFilterDataFormat("Weight", OHWI, "WeightOIHW", OIHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + net.TransformFilterDataFormat( + "Weight", DataFormat::OHWI, "WeightOIHW", DataFormat::OIHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") @@ -226,7 +230,8 @@ void QuantRandom(const index_t batch, .AddIntArg("T", DT_FLOAT) .Finalize(net.NewOperatorDef()); net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeWeight") .Input("Weight") diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index e35970066f71691c002017e776edff217e56f44c..9a2d2cdfc422b503b729fd81fef89104508dab3e 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -29,7 +29,8 @@ void Simple() { {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest") .Input("InputNCHW") @@ -41,7 +42,8 @@ void Simple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index 82ed9053b6d05a40c2e31e6854c0ec16c62f7ae8..d43dbf6bd462da56cf73a7eedca8e8863a089dbf 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -36,7 +36,7 @@ class LSTMCellOp : public Operation { Operation::GetOptionalArg("scalar_input", 0.0)); MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(forget_bias); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 65df7305ea769cbbfd5a6c5ebfa8a779b95fe954..b662ce2ee97859051d1c34553d1519dc5939c99f 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -518,14 +518,6 @@ void RegisterMatMul(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, DeviceType::CPU, uint8_t); #endif // MACE_ENABLE_QUANTIZE - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, - DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, - DeviceType::GPU, half); -#endif // MACE_ENABLE_OPENCL } } // namespace ops diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index 20dc6d1ac9da37ca99bc70eed9905afbfd89ceb7..d2ef505825eceee5dfb43629ddc250636f952540 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -23,7 +23,6 @@ #include "mace/ops/opencl/image/buffer_to_image.h" #include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/buffer/buffer_transform.h" -#include "mace/ops/common/transpose.h" #include "mace/utils/memory.h" namespace mace { @@ -48,7 +47,6 @@ class OpenCLBufferTransformer { const OpenCLBufferType type, const MemoryType out_mem_type, const int wino_blk_size, - bool has_data_format, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -67,31 +65,11 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform CPU Buffer " << input->name() << " to GPU Buffer " << internal_tensor->name() << " with data type " << dt; - if (has_data_format && input->shape().size() == 4) { - // 1. (NCHW -> NHWC) - std::vector dst_dims = {0, 2, 3, 1}; - std::vector output_shape = - TransposeShape(input->shape(), - dst_dims); - internal_tensor->Resize(output_shape); - internal_tensor->set_data_format(DataFormat::NHWC); - // TODO(liuqi): Only support float now - const float *input_ptr = input->data(); - Tensor::MappingGuard guard(internal_tensor); - float *internal_ptr = internal_tensor->mutable_data(); - MACE_RETURN_IF_ERROR(ops::Transpose( - &context->device()->cpu_runtime()->thread_pool(), - input_ptr, - input->shape(), - dst_dims, - internal_ptr)); - } else { - internal_tensor->Resize(input->shape()); - const uint8_t *input_ptr = input->data(); - Tensor::MappingGuard guard(internal_tensor); - uint8_t *internal_ptr = internal_tensor->mutable_data(); - memcpy(internal_ptr, input_ptr, input->raw_size()); - } + internal_tensor->Resize(input->shape()); + const uint8_t *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + uint8_t *internal_ptr = internal_tensor->mutable_data(); + memcpy(internal_ptr, input_ptr, input->raw_size()); // 2. convert the internal GPU Buffer to output. return kernel_->Compute( context, internal_tensor, type, wino_blk_size, output); @@ -108,30 +86,12 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() << " to CPU Buffer " << output->name() << " with data type " << dt; - if (has_data_format && internal_tensor.shape().size() == 4) { - // NHWC -> NCHW - std::vector dst_dims = {0, 3, 1, 2}; - std::vector output_shape = - TransposeShape(internal_tensor.shape(), - dst_dims); - output->set_data_format(DataFormat::NCHW); - Tensor::MappingGuard guard(&internal_tensor); - const float *internal_ptr = internal_tensor.data(); - output->Resize(output_shape); - float *output_ptr = output->mutable_data(); - return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(), - internal_ptr, - internal_tensor.shape(), - dst_dims, - output_ptr); - } else { - Tensor::MappingGuard guard(&internal_tensor); - const T *internal_ptr = internal_tensor.data(); - output->Resize(internal_tensor.shape()); - T *output_ptr = output->mutable_data(); - memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); - return MaceStatus::MACE_SUCCESS; - } + Tensor::MappingGuard guard(&internal_tensor); + const T *internal_ptr = internal_tensor.data(); + output->Resize(internal_tensor.shape()); + T *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + return MaceStatus::MACE_SUCCESS; } else { LOG(FATAL) << "Unexpected error: " << out_mem_type; return MaceStatus::MACE_SUCCESS; @@ -172,7 +132,7 @@ MaceStatus TransformFilter( input->MarkUnused(); return OpenCLBufferTransformer(input->memory_type(), mem_type). Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, - DataFormat::DF_NONE, output); + output); } } // namespace ops diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index bc1a702532fcfec6f32866fc332bdfe717f79416..9c8a1a3133e63d7e8c486ca292f86f0fa2b981db 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -71,14 +71,17 @@ MaceStatus EltwiseKernel::Compute( if (input1 == nullptr) { input1_type = "INPUT_SCALAR"; } else { - MACE_CHECK(input0->dim_size() == input1->dim_size() || + MACE_CHECK((input0->dim_size() == input1->dim_size() + && input0->dim_size() == 4) || input0->dim_size() == 1 || input1->dim_size() == 1) - << "Inputs of Eltwise op must be same shape"; + << "Inputs of Eltwise op must be same shape or fulfill broadcast logic"; MACE_CHECK(type_ != EltwiseType::EQUAL) << "Eltwise op on GPU does not support EQUAL"; // broadcast - if (input0->size() != input1->size()) { - if (input0->size() < input1->size()) { + if (input0->size() != input1->size() || + input0->dim_size() != input1->dim_size()) { + if (input0->size() < input1->size() + || input0->dim_size() < input1->dim_size()) { std::swap(input0, input1); swapped = true; } diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index a2bdc65280fd82cdd244c0c949e2753765a3bf6d..fa69a11621c5f395be237bed7867c356b576a844 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -59,11 +59,6 @@ MaceStatus ReduceKernel::Compute( const Tensor *input, Tensor *output) { MACE_CHECK_NOTNULL(input); - MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); - MACE_CHECK(input->dim_size() == 4, - "reduce gpu only support 4-dim input"); - MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2, - "reduce gpu only support 1,2-axis reduce"); index_t batch = input->dim(0); const index_t in_height = input->dim(1); const index_t in_width = input->dim(2); diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index bcf1282d2211fe5ae022aced1fa5a896c3545b44..a0761101b1f83949eaaa371da3c1451e249373f4 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -15,6 +15,7 @@ #include "mace/ops/ops_test_util.h" #include "mace/core/memory_optimizer.h" #include "mace/utils/memory.h" +#include "mace/core/net_def_adapter.h" namespace mace { namespace ops { @@ -175,26 +176,27 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() { bool OpsTestNet::Setup(mace::DeviceType device) { NetDef net_def; for (auto &op_def : op_defs_) { - net_def.add_op()->CopyFrom(op_def); - + auto target_op = net_def.add_op(); + target_op->CopyFrom(op_def); + + auto has_data_format = ProtoArgHelper::GetOptionalArg( + op_def, "has_data_format", 0); + auto is_quantized_op = ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DT_FLOAT)) + == static_cast(DT_UINT8); for (auto input : op_def.input()) { if (ws_.GetTensor(input) != nullptr && !ws_.GetTensor(input)->is_weight()) { auto input_info = net_def.add_input_info(); input_info->set_name(input); - auto has_data_format = ProtoArgHelper::GetOptionalArg( - op_def, "has_data_format", 1); - auto is_quantized_op = ProtoArgHelper::GetOptionalArg( - op_def, "T", static_cast(DT_FLOAT)) - == static_cast(DT_UINT8); if (has_data_format) { if (is_quantized_op || device == DeviceType::GPU) { - input_info->set_data_format(NHWC); + input_info->set_data_format(static_cast(DataFormat::NHWC)); } else { - input_info->set_data_format(NCHW); + input_info->set_data_format(static_cast(DataFormat::NCHW)); } } else { - input_info->set_data_format(DataFormat::DF_NONE); + input_info->set_data_format(static_cast(DataFormat::NONE)); } auto &shape = ws_.GetTensor(input)->shape(); for (auto d : shape) { @@ -202,6 +204,10 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } + if (has_data_format) { + SetProtoArg(target_op, "data_format", + static_cast(DataFormat::AUTO)); + } } if (!op_defs_.empty()) { auto op_def = op_defs_.back(); @@ -216,15 +222,21 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), &ws_); + net_def_adapter.AdaptNetDef(&net_def, + OpTestContext::Get()->GetDevice(device), + &adapted_net_def); + MemoryOptimizer mem_optimizer; net_ = make_unique( op_registry_.get(), - &net_def, + &adapted_net_def, &ws_, OpTestContext::Get()->GetDevice(device), &mem_optimizer); MaceStatus status = (ws_.PreallocateOutputTensor( - net_def, + adapted_net_def, &mem_optimizer, OpTestContext::Get()->GetDevice(device))); if (status != MaceStatus::MACE_SUCCESS) return false; @@ -267,15 +279,20 @@ MaceStatus OpsTestNet::RunOp() { MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, const mace::DeviceType device) { device_type_ = device; + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), &ws_); + net_def_adapter.AdaptNetDef(&net_def, + OpTestContext::Get()->GetDevice(device), + &adapted_net_def); MemoryOptimizer mem_optimizer; net_ = make_unique( op_registry_.get(), - &net_def, + &adapted_net_def, &ws_, OpTestContext::Get()->GetDevice(device), &mem_optimizer); MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( - net_def, + adapted_net_def, &mem_optimizer, OpTestContext::Get()->GetDevice(device))); MACE_RETURN_IF_ERROR(net_->Init()); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index d2212a659078075a60df305db95d5dee1b0cd584..bdc67037c4dd3fc897757dc3d1c95ab0f6e4267d 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -223,7 +223,7 @@ class OpsTestNet { const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); - if (src_format == NHWC && dst_format == NCHW) { + if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) { index_t batch = input_shape[0]; index_t height = input_shape[1]; index_t width = input_shape[2]; @@ -243,7 +243,8 @@ class OpsTestNet { } } } - } else if (src_format == NCHW && dst_format == NHWC) { + } else if (src_format == DataFormat::NCHW && + dst_format == DataFormat::NHWC) { index_t batch = input_shape[0]; index_t channels = input_shape[1]; index_t height = input_shape[2]; @@ -281,7 +282,7 @@ class OpsTestNet { input->is_weight()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); - if (src_format == HWOI && dst_format == OIHW) { + if (src_format == DataFormat::HWOI && dst_format == DataFormat::OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t out_channels = input_shape[2]; @@ -299,7 +300,8 @@ class OpsTestNet { input_data[j * out_channels * in_channels + i]; } } - } else if (src_format == OIHW && dst_format == HWOI) { + } else if (src_format == DataFormat::OIHW && + dst_format == DataFormat::HWOI) { index_t out_channels = input_shape[0]; index_t in_channels = input_shape[1]; index_t height = input_shape[2]; @@ -317,7 +319,8 @@ class OpsTestNet { input_data[j * height * width + i]; } } - } else if (src_format == HWIO && dst_format == OIHW) { + } else if (src_format == DataFormat::HWIO && + dst_format == DataFormat::OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t in_channels = input_shape[2]; @@ -337,7 +340,8 @@ class OpsTestNet { } } } - } else if (src_format == OHWI && dst_format == OIHW) { + } else if (src_format == DataFormat::OHWI && + dst_format == DataFormat::OIHW) { index_t out_channels = input_shape[0]; index_t height = input_shape[1]; index_t width = input_shape[2]; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index e0a94f4a7f5b2f6a00eddd816b3b92ae9da816d1..24130d7ae381222fb6219b4d335afc4a9e0c5723 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -179,7 +179,7 @@ class PadOp : public Operation { std::vector paddings = Operation::GetRepeatedArgs("paddings"); float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( type, paddings, constant_value); } else { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index e68e8eb8d06b864b9c9173ada5fbb2312ec0566c..977305597ae742866d2c1d63c48f571cfaa884e7 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -45,8 +45,8 @@ void SimpleConstant() { // Run net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -58,8 +58,8 @@ void SimpleConstant() { // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto output = net.GetTensor("Output"); @@ -93,7 +93,8 @@ void Result(const std::vector &input_shape, if (D == DeviceType::CPU) { t_input = "TInput"; t_output = "TOutput"; - net.TransformDataFormat(input, NHWC, t_input, NCHW); + net.TransformDataFormat( + input, DataFormat::NHWC, t_input, DataFormat::NCHW); } OpDefBuilder("Pad", "PadTest") @@ -108,7 +109,8 @@ void Result(const std::vector &input_shape, net.RunOp(D); if (D == DeviceType::CPU) { - net.TransformDataFormat(t_output, NCHW, output, NHWC); + net.TransformDataFormat( + t_output, DataFormat::NCHW, output, DataFormat::NHWC); } auto actual = net.GetTensor(output.c_str()); @@ -172,8 +174,8 @@ TEST_F(PadTest, ComplexCPU) { // Add input data net.AddRepeatedInput("Input", {1, 1, 1, 2}, 2); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -184,8 +186,8 @@ TEST_F(PadTest, ComplexCPU) { // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto output = net.GetTensor("Output"); @@ -209,8 +211,8 @@ void Complex(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -222,8 +224,8 @@ void Complex(const std::vector &input_shape, // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 52842c5230a299ade8af2d85e24ba23f00052e30..ce726dcb3d6797d9020c1c1e2dfdddbad6069471 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -270,9 +270,9 @@ class PoolingOp : public PoolingOpBase { std::vector paddings(2); if (paddings_.empty()) { CalcPaddingAndOutputSize(input_tensor->shape().data(), - NHWC, + DataFormat::NHWC, filter_shape.data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -281,9 +281,9 @@ class PoolingOp : public PoolingOpBase { } else { paddings = paddings_; CalcOutputSize(input_tensor->shape().data(), - NHWC, + DataFormat::NHWC, filter_shape.data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -477,10 +477,9 @@ class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) : PoolingOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { - context->set_output_mem_type(MemoryType::GPU_BUFFER); kernel_ = make_unique>(); } } diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 104b67bc304de59a16d54bcdc6c66c68c987c0c7..037cf8cf76e1926f941a92ea5eb1197b11e74b99 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -34,8 +34,8 @@ TEST_F(PoolingOpTest, MAX_VALID) { {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -50,8 +50,8 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = @@ -68,8 +68,8 @@ TEST_F(PoolingOpTest, MAX_SAME) { net.AddInputFromArray("Input", {1, 3, 3, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -84,8 +84,8 @@ TEST_F(PoolingOpTest, MAX_SAME) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); @@ -102,8 +102,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -118,8 +118,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); @@ -136,8 +136,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -152,8 +152,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); @@ -174,8 +174,8 @@ void SimpleMaxPooling3S2() { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Run OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -187,8 +187,8 @@ void SimpleMaxPooling3S2() { .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") @@ -224,8 +224,8 @@ void MaxPooling3S2(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -240,8 +240,8 @@ void MaxPooling3S2(const std::vector &input_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -304,8 +304,8 @@ TEST_F(PoolingOpTest, AVG_VALID) { {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -320,8 +320,8 @@ TEST_F(PoolingOpTest, AVG_VALID) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor( @@ -373,8 +373,8 @@ void AvgPoolingTest(const std::vector &shape, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -389,8 +389,8 @@ void AvgPoolingTest(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -563,7 +563,7 @@ void TestQuant(const index_t batch, net.AddRandomInput( "Input", input_shape, false, false); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddRandomInput( "OutputNCHW", input_shape, false, true, true); @@ -580,7 +580,7 @@ void TestQuant(const index_t batch, net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 29ce821b84a98f8552ce4d3e60a0f9d693f39f0d..27b34a91a32c214f22074e2f8605fdb29dd0d6f7 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include "mace/core/future.h" @@ -872,7 +873,7 @@ class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(reduce_type_, axis_, keep_dims_); @@ -907,6 +908,34 @@ void RegisterReduce(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("Reduce") + .SetDevicePlacerFunc( + [](OpConditionContext *context) -> std::set { + auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } + bool keep_dims = + ProtoArgHelper::GetOptionalArg( + *op, "keepdims", false); + if (!keep_dims) { + return { DeviceType::CPU }; + } + auto axis = + ProtoArgHelper::GetRepeatedArgs( + *op, "axis"); + if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { + return { DeviceType::CPU }; + } + auto tensor_shape_info = context->tensor_shape_info(); + if (tensor_shape_info->count(op->input(0)) == 0 + || tensor_shape_info->at(op->input(0)).size() != 4) { + return { DeviceType::CPU }; + } + return { DeviceType::CPU, DeviceType::GPU }; + })); } } // namespace ops diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc index ccf38fea25e08f6187d2875fdec363e9fa67ebe2..21a2dc13c3d63c8da97b47690b576d3d2499c6bf 100644 --- a/mace/ops/reduce_test.cc +++ b/mace/ops/reduce_test.cc @@ -38,7 +38,8 @@ void Simple(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Reduce", "ReduceTest") .Input("InputNCHW") .AddIntsArg("axis", axis) @@ -49,7 +50,8 @@ void Simple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Reduce", "ReduceTest") .Input("Input") @@ -289,8 +291,8 @@ void RandomTest(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Reduce", "ReduceTest") .Input("InputNCHW") .AddIntsArg("axis", axis) @@ -301,8 +303,8 @@ void RandomTest(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Reduce", "ReduceTest") .Input("Input") .AddIntsArg("axis", axis) @@ -353,7 +355,7 @@ void TestQuant(const std::vector &input_shape, net.AddRandomInput( "Input", input_shape, false, false); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddRandomInput( "OutputNCHW", input_shape, false, true, true); @@ -368,7 +370,7 @@ void TestQuant(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc index 6044af3b7fefa5e698bb6db02220832a8802af79..d06c6634548dfb079f615f01f9e394950a214059 100644 --- a/mace/ops/ref/deconv_2d.cc +++ b/mace/ops/ref/deconv_2d.cc @@ -51,7 +51,7 @@ MaceStatus Deconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc index 0da81faa60b5268d0effb3777669f9419483f77b..63b3aa6959ef343ef226a671614626f73578ea53 100644 --- a/mace/ops/ref/depthwise_deconv_2d.cc +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -50,7 +50,7 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); @@ -185,7 +185,7 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index f06692b9711c87e04e710eaaa2c1bce39f44f38f..349f6423470b4db78df0f65e24b1dc1ae00bef58 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -212,7 +212,7 @@ class ResizeBicubicOp : public Operation { std::vector size = Operation::GetRepeatedArgs( "size", {-1, -1}); MACE_CHECK(size.size() == 2); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners, size[0], size[1]); } else { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 035ddfcf8d9b0d80ea3cacdd07206848bc73cd5e..e9c5e4d10d35b19e6189889647aded2539e57809 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -31,8 +31,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -42,8 +42,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -60,8 +60,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { std::vector input(48); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 4, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -71,8 +71,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 3, 3}, @@ -92,8 +92,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -104,8 +104,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -133,8 +133,8 @@ void TestRandomResizeBicubic() { net.AddRandomInput("Input", {batch, in_height, in_width, channels}, false, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -144,8 +144,8 @@ void TestRandomResizeBicubic() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); Tensor expected; expected.Copy(*net.GetOutput("Output")); diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 1fe13f42b2ee20258fb55634746b85f492eea70e..09df62d880cad6a1f9ece73e5312a2b56df46340 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -346,7 +346,7 @@ class ResizeBilinearOp : public Operation { std::vector size = Operation::GetRepeatedArgs( "size", {-1, -1}); MACE_CHECK(size.size() == 2); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners, size[0], size[1]); } else { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 9252e81fc56c2bd7932499646f3264a6872b1a22..c9c86427909517028bb7f495a02ccd466a690ab8 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -31,8 +31,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -42,8 +42,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -60,8 +60,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -72,8 +72,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -100,8 +100,8 @@ void TestRandomResizeBilinear() { // Add input data net.AddRandomInput("Input", {batch, in_height, in_width, channels}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -111,8 +111,8 @@ void TestRandomResizeBilinear() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -155,8 +155,8 @@ void TestQuantizedResizeBilinear() { true, -1.f, 1.f); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -166,8 +166,8 @@ void TestQuantizedResizeBilinear() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") diff --git a/mace/ops/resize_nearest_neighbor.cc b/mace/ops/resize_nearest_neighbor.cc index 8840458f96f171ae0886b0181163b43c0093b02e..9e98e75e16313fc7d3093260feaa0207d40bcbd0 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -149,7 +149,7 @@ class ResizeNearestNeighborOp : public Operation { : Operation(context) { bool align_corners = Operation::GetOptionalArg( "align_corners", false); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners); } else { diff --git a/mace/ops/resize_nearest_neighbor_test.cc b/mace/ops/resize_nearest_neighbor_test.cc index b950047204a1dd8e3fb721622d7ce44635f08b0d..842c44c65ec63181e171191d1182008903aeed9f 100644 --- a/mace/ops/resize_nearest_neighbor_test.cc +++ b/mace/ops/resize_nearest_neighbor_test.cc @@ -32,8 +32,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) { std::iota(begin(input), end(input), 0); std::vector size = {1, 2}; net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") @@ -45,8 +45,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -64,8 +64,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) { std::iota(begin(input), end(input), 0); std::vector size = {1, 2}; net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") @@ -78,8 +78,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -105,8 +105,8 @@ void TestRandomResizeNearestNeighbor() { std::vector size = {20, 40}; net.AddRandomInput("Input", {batch, in_height, in_width, channels}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") .Input("InputNCHW") @@ -116,8 +116,8 @@ void TestRandomResizeNearestNeighbor() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 5d311cbc26af7d6cd66417ba9c5c1dea6cfa9f8c..07794065dbf678ccce6fe1c808240ce6508a4df7 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -100,11 +100,7 @@ class ScalarMathOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)) { - if (D == DeviceType::GPU) { - context->set_output_mem_type(MemoryType::GPU_BUFFER); - } - } + "scalar_input_index", 1)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index 0eda5bf3ccee4973d9d9997ebdaac7fa5293ffa3..e32410989fe8c14cf936330769fd700eb0fe31b5 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -414,10 +414,9 @@ class SoftmaxOp : public Operation { : Operation(context) { bool use_log = ( Operation::GetOptionalArg("use_log", false)); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(use_log); } else { - context->set_output_mem_type(MemoryType::GPU_BUFFER); kernel_ = make_unique>(use_log); } } @@ -456,7 +455,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { op_registry, OpConditionBuilder("Softmax") .SetDevicePlacerFunc( - [](OpConstructContext *context) -> std::set { + [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { return { DeviceType::CPU, DeviceType::GPU }; diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index ab818ac8d55b5c0b277c41fb0044797666ee4bce..eb3398db20217688e5d4d5aa42c6588c03fb0745 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -50,7 +50,8 @@ void Simple(bool use_log = false) { if (D == DeviceType::CPU) { // test 4d softmax - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -59,7 +60,8 @@ void Simple(bool use_log = false) { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -109,7 +111,8 @@ void Complex(const std::vector &logits_shape, net.AddRandomInput("Input", logits_shape); if (logits_shape.size() == 4) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") @@ -127,7 +130,8 @@ void Complex(const std::vector &logits_shape, net.RunOp(); if (logits_shape.size() == 4) { - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(); diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index b239193c2641af400fb5c67f25be2efff8c04859..50de3fc74b1104ccac8576e29a90911789dc91fd 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -307,7 +307,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 95b9fafc7e7fbdef97b9ab379b7aad8175ddbd51..045d6eceba3afc98e2b242d820637d1de04789fe 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -39,8 +39,8 @@ void RunSpaceToBatch(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -53,8 +53,8 @@ void RunSpaceToBatch(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check ExpectTensorNear(*expected, *net.GetOutput("Output")); @@ -78,8 +78,8 @@ void RunBatchToSpace(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -92,8 +92,8 @@ void RunBatchToSpace(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check ExpectTensorNear(*expected, *net.GetOutput("Output")); @@ -155,8 +155,8 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, net.RunOp(GPU); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -164,8 +164,8 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // Check ExpectTensorNear(*net.GetOutput("OutputCPU"), @@ -188,8 +188,8 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, net.RunOp(GPU); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -197,8 +197,8 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // Check ExpectTensorNear(*net.GetOutput("OutputCPU"), @@ -218,8 +218,8 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, 1.f); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -227,8 +227,8 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") @@ -279,8 +279,8 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, 1.f); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -288,8 +288,8 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 918ae678b5cb09c2f6c8f2a584f3b5fbb5d47997..9584ddb8d7d43f3cea7c5b0612e7bca24346070d 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -94,7 +94,7 @@ class SpaceToDepthOp : public Operation { explicit SpaceToDepthOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index 23daaa55d3604f9e629e67a5b01acb0019926a2c..6d023b88c9873d5e0d9b63cf54eebf1695594209 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -32,8 +32,8 @@ void RunSpaceToDepth(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); // Construct graph if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -41,8 +41,8 @@ void RunSpaceToDepth(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") @@ -107,8 +107,8 @@ void RandomTest(const int block_size, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) @@ -118,8 +118,8 @@ void RandomTest(const int block_size, // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("Input") diff --git a/mace/ops/split.cc b/mace/ops/split.cc index e1523a06253c2a38c2451046e4daa1b0c51d2713..b08d72c533d480a65cbff0c6fefb6a3b940322d6 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -106,7 +106,7 @@ class SplitOp : public Operation { explicit SplitOp(OpConstructContext *context) : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(axis); } else { MACE_NOT_IMPLEMENTED; @@ -144,7 +144,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { op_registry, OpConditionBuilder("Split") .SetDevicePlacerFunc( - [](OpConstructContext *context) -> std::set { + [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { return {DeviceType::CPU, DeviceType::GPU}; diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index d58191c4d0bd6b2d992af9495c56b1a7dca4bc44..cd2fb1742f4a31992922deb357f4cfa788c032f8 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -83,7 +83,7 @@ class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/sqrdiff_mean_test.cc b/mace/ops/sqrdiff_mean_test.cc index 342574792222bf4de691038a757feca926913663..3257987c7b9d8dc65a218059cd5c44ae9ab2e55d 100644 --- a/mace/ops/sqrdiff_mean_test.cc +++ b/mace/ops/sqrdiff_mean_test.cc @@ -36,13 +36,13 @@ void Simple(const std::vector &input_shape0, net.AddInputFromArray("Input1", input_shape1, input1); net.TransformDataFormat("Input0", - NHWC, + DataFormat::NHWC, "InputNCHW0", - NCHW); + DataFormat::NCHW); net.TransformDataFormat("Input1", - NHWC, + DataFormat::NHWC, "InputNCHW1", - NCHW); + DataFormat::NCHW); if (D == DeviceType::CPU) { OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") @@ -54,9 +54,9 @@ void Simple(const std::vector &input_shape0, net.RunOp(D); net.TransformDataFormat("OutputNCHW", - NCHW, + DataFormat::NCHW, "Output", - NHWC); + DataFormat::NHWC); } else { OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("Input0") @@ -107,10 +107,10 @@ void RandomTest(const std::vector &input_shape0, net.AddRandomInput("Input0", input_shape0); net.AddRandomInput("Input1", input_shape1); - net.TransformDataFormat("Input0", NHWC, "InputNCHW0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "InputNCHW1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "InputNCHW0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "InputNCHW1", DataFormat::NCHW); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("InputNCHW0") .Input("InputNCHW1") @@ -118,8 +118,8 @@ void RandomTest(const std::vector &input_shape0, .Finalize(net.NewOperatorDef()); // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("Input0") .Input("Input1") diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 15c3408c2bbbfbc6832af699045036d1580152c7..660a8e8f3dbfd8b54e701b5ff7714dc0c942aa3f 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -77,7 +77,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { op_registry, OpConditionBuilder("Squeeze") .SetDevicePlacerFunc( - [](OpConstructContext *context) -> std::set { + [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { return { DeviceType::CPU, DeviceType::GPU }; diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index 8b085fe532694f7c343e0cfda735d91332aea294..f8dd06f551a26c023093f9a73d83d55fed87ddd7 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -86,8 +86,8 @@ void TestStridedSliceWithDataFormat(const std::vector &input_shape, net.AddInputFromArray( "Strides", {static_cast(strides.size())}, strides); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("StridedSlice", "StridedSliceOpTest") .Input("InputNCHW") @@ -105,8 +105,8 @@ void TestStridedSliceWithDataFormat(const std::vector &input_shape, net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", output_shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), *net.GetOutput("Output")); @@ -154,8 +154,8 @@ void TestSliceWithDataFormat(const std::vector &input_shape, net.AddInputFromArray( "IndicesSize", {static_cast(indices_size.size())}, indices_size); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("StridedSlice", "StridedSliceOpTest") .Input("InputNCHW") @@ -168,8 +168,8 @@ void TestSliceWithDataFormat(const std::vector &input_shape, net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", output_shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), *net.GetOutput("Output")); diff --git a/mace/public/mace.h b/mace/public/mace.h index fd39fdba6c501b6f1aa4eb6cb7980fa5158012ca..72e96d1e38a0438a3f1df8c5e4725b6d7f69d8a7 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -34,9 +34,10 @@ class NetDef; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 }; -enum DataFormat { - DF_NONE = 0, NHWC = 1, NCHW = 2, - HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 +enum class DataFormat { + NONE = 0, NHWC = 1, NCHW = 2, + HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103, + AUTO = 1000, }; enum GPUPerfHint { diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 446321a447703414ba00e51d74745c5df635ee69..58658dd81d90b7b9110706338ae7328214ada19b 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -41,7 +41,7 @@ device_type_map = {'cpu': cvt.DeviceType.CPU.value, 'cpu+gpu': cvt.DeviceType.CPU.value} data_format_map = { - 'NONE': cvt.DataFormat.DF_NONE, + 'NONE': cvt.DataFormat.NONE, 'NHWC': cvt.DataFormat.NHWC, 'NCHW': cvt.DataFormat.NCHW, 'OIHW': cvt.DataFormat.OIHW, diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 80da9b1d50a23152cd48b88a019801bfea40ad2c..61e65bae9152ed3337306addd84e6e29c2d9bc57 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -26,13 +26,14 @@ class DeviceType(Enum): class DataFormat(Enum): - DF_NONE = 0 + NONE = 0 NHWC = 1 NCHW = 2 HWIO = 100 OIHW = 101 HWOI = 102 OHWI = 103 + AUTO = 1000 # SAME_LOWER: if the amount of paddings to be added is odd, @@ -161,13 +162,39 @@ MaceSupportedOps = [ 'SumGroup', 'TargetRMSNorm', 'Transpose', - 'WinogradInverseTransform', - 'WinogradTransform', 'Cumsum', ] MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str) +MaceHasDataFormatOps = [MaceOp.BatchNorm, + MaceOp.BatchToSpaceND, + MaceOp.Conv2D, + MaceOp.Deconv2D, + MaceOp.DepthToSpace, + MaceOp.DepthwiseConv2d, + MaceOp.DepthwiseDeconv2d, + MaceOp.FullyConnected, + MaceOp.Pooling, + MaceOp.ResizeBicubic, + MaceOp.ResizeBilinear, + MaceOp.ResizeNearestNeighbor, + MaceOp.SpaceToBatchND, + MaceOp.SpaceToDepth] + +MaceMayHasDataFormatOps = [MaceOp.Activation, + MaceOp.AddN, + MaceOp.BiasAdd, + MaceOp.ChannelShuffle, + MaceOp.Concat, + MaceOp.Crop, + MaceOp.Eltwise, + MaceOp.Pad, + MaceOp.Reduce, + MaceOp.Softmax, + MaceOp.Split, + MaceOp.SqrDiffMean] + class MaceKeyword(object): # node related str @@ -505,12 +532,11 @@ class ConverterOption(object): TransformerRule.TRANSFORM_CHANNEL_SHUFFLE, # Model data format related transformation TransformerRule.TRANSPOSE_FILTERS, - TransformerRule.TRANSPOSE_DATA_FORMAT, + # Mace model structure related transformation + TransformerRule.ADD_IN_OUT_TENSOR_INFO, TransformerRule.TRANSPOSE_MATMUL_WEIGHT, # Add winograd argument TransformerRule.ADD_WINOGRAD_ARG, - # Mace model structure related transformation - TransformerRule.ADD_IN_OUT_TENSOR_INFO, # Data type related transformation TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, # Transform finalization @@ -519,6 +545,7 @@ class ConverterOption(object): TransformerRule.SORT_BY_EXECUTION, # update the data format of ops TransformerRule.UPDATE_DATA_FORMAT, + TransformerRule.TRANSPOSE_DATA_FORMAT, # Need to be put after SORT_BY_EXECUTION TransformerRule.ADD_QUANTIZE_TENSOR_RANGE, ] @@ -571,6 +598,8 @@ class ConverterUtil(object): return DataFormat.NHWC elif arg.i == DataFormat.NCHW.value: return DataFormat.NCHW + elif arg.i == DataFormat.AUTO.value: + return DataFormat.AUTO else: return None diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index c5b6176824d28dcf67a4dd68defdebdfecafcbed..b65a10f41e5d52a79d8386df9b2938230506e9cd 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -195,6 +195,7 @@ class CaffeConverter(base_converter.ConverterInterface): self._option = option self._mace_net_def = mace_pb2.NetDef() ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) + ConverterUtil.add_data_format_arg(self._mace_net_def, DataFormat.NCHW) self._caffe_net = CaffeNet() self._caffe_layers = caffe_pb2.NetParameter() caffe_weights = caffe_pb2.NetParameter() diff --git a/mace/python/tools/converter_tool/onnx_converter.py b/mace/python/tools/converter_tool/onnx_converter.py index 54d53db0081d7c94f83b2978f331196d39183883..70e855d5b693c199a42a7f0df5b8a8f28441907d 100644 --- a/mace/python/tools/converter_tool/onnx_converter.py +++ b/mace/python/tools/converter_tool/onnx_converter.py @@ -387,6 +387,8 @@ class OnnxConverter(base_converter.ConverterInterface): self._mace_net_def = mace_pb2.NetDef() self._data_format = DataFormat.NCHW ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) + ConverterUtil.add_data_format_arg(self._mace_net_def, + self._data_format) onnx_model = onnx.load(src_model_file) ir_version = onnx_model.ir_version @@ -402,7 +404,7 @@ class OnnxConverter(base_converter.ConverterInterface): print("constains ops domain: ", domain, "version:", version) if 'kaldi2onnx' in domain: polish_available = False - self._data_format = DataFormat.DF_NONE + self._data_format = DataFormat.NONE self._isKaldi = True if polish_available: onnx_model = onnx.utils.polish_model(onnx_model) diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 581801521ed185e7c90ca420d17d01775a369ee9..66fef5cb9cda43074724e4542611b4e38bab1795 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -270,6 +270,7 @@ class TensorflowConverter(base_converter.ConverterInterface): self._option = option self._mace_net_def = mace_pb2.NetDef() ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.HWIO) + ConverterUtil.add_data_format_arg(self._mace_net_def, DataFormat.NHWC) # import tensorflow graph tf_graph_def = tf.GraphDef() diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index faf33034e292233372dd367e71a2bae67ddf0887..51806961d045e40a9cc9de184238b41b5d953308 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -27,6 +27,8 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp +from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps # noqa +from mace.python.tools.converter_tool.base_converter import MaceMayHasDataFormatOps # noqa from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.converter_tool.base_converter import TransformerRule @@ -77,10 +79,9 @@ class Transformer(base_converter.ConverterInterface): self.transpose_matmul_weight, TransformerRule.FOLD_FC_RESHAPE: self.fold_fc_reshape, - TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format, - TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg, TransformerRule.ADD_IN_OUT_TENSOR_INFO: self.add_in_out_tensor_info, + TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg, TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: self.transform_global_conv_to_fc, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, @@ -96,6 +97,7 @@ class Transformer(base_converter.ConverterInterface): self.add_opencl_informations, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format, + TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format, TransformerRule.CHECK_QUANTIZE_INFO: self.check_quantize_info, TransformerRule.TRANSPOSE_CAFFE_RESHAPE_AND_FLATTEN: @@ -194,21 +196,19 @@ class Transformer(base_converter.ConverterInterface): op.type = "Input" data_type_arg = op.arg.add() data_type_arg.name = MaceKeyword.mace_op_data_type_str - data_type_arg.i = mace_pb2.DT_FLOAT + data_type_arg.i = input_node.data_type op.output.extend([input_node.name]) output_shape = op.output_shape.add() output_shape.dims.extend(input_node.shape) - if input_node.name in self._consumers: - if ConverterUtil.data_format( - self._consumers[input_node.name][0]) \ - == DataFormat.NCHW: + if input_node.data_format != DataFormat.NONE: + if input_node.data_format == DataFormat.NCHW: self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) - ConverterUtil.add_data_format_arg(op, - DataFormat.NCHW) - else: - ConverterUtil.add_data_format_arg(op, - DataFormat.NHWC) + ConverterUtil.add_data_format_arg(op, + DataFormat.AUTO) + else: + ConverterUtil.add_data_format_arg(op, + DataFormat.NONE) self._producer[op.output[0]] = op @staticmethod @@ -256,6 +256,13 @@ class Transformer(base_converter.ConverterInterface): else: return None + def get_tensor_data_format(self, tensor): + if tensor in self._producer: + producer = self._producer[tensor] + return ConverterUtil.data_format(producer) + else: + return DataFormat.NONE + def consumer_count(self, tensor_name): return len(self._consumers.get(tensor_name, [])) @@ -838,8 +845,6 @@ class Transformer(base_converter.ConverterInterface): or op.type == MaceOp.DepthwiseConv2d.name or op.type == MaceOp.FullyConnected.name) and len(op.input) == 2) - or (op.type == MaceOp.WinogradInverseTransform.name - and len(op.input) == 1) or (op.type == MaceOp.Deconv2D.name and ((ConverterUtil.get_arg( op, @@ -930,8 +935,7 @@ class Transformer(base_converter.ConverterInterface): or op.type == MaceOp.Deconv2D.name or op.type == MaceOp.DepthwiseConv2d.name or op.type == MaceOp.FullyConnected.name - or op.type == MaceOp.BatchNorm.name - or op.type == MaceOp.WinogradInverseTransform.name) \ + or op.type == MaceOp.BatchNorm.name) \ and len(self._consumers.get(op.output[0], [])) == 1: consumer_op = self._consumers[op.output[0]][0] if consumer_op.type == MaceOp.Activation.name \ @@ -1017,97 +1021,6 @@ class Transformer(base_converter.ConverterInterface): filter_format.name) return False - def transpose_data_format(self): - net = self._model - - for op in net.op: - # transpose args - if op.type == MaceOp.Pad.name: - for arg in op.arg: - if arg.name == MaceKeyword.mace_paddings_str: - mace_check(len(arg.ints) == 8, - "pad dim rank should be 8.") - if ConverterUtil.data_format(op) == DataFormat.NCHW: - print("Transpose pad args: %s(%s)" - % (op.name, op.type)) - self.transpose_shape(arg.ints, - [0, 1, 4, 5, 6, 7, 2, 3]) - elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name: - for arg in op.arg: - if arg.name == MaceKeyword.mace_axis_str: - if (ConverterUtil.data_format(op) == DataFormat.NCHW - and len(op.output_shape[0].dims) == 4): - print("Transpose concat/split args: %s(%s)" - % (op.name, op.type)) - if arg.i == 1: - arg.i = 3 - elif arg.i == 2: - arg.i = 1 - elif arg.i == 3: - arg.i = 2 - - producer = self._producer[op.input[0]] - input_shape = producer.output_shape[0].dims - if producer.type == MaceOp.FullyConnected.name and \ - len(input_shape) == 2: - axis_arg = ConverterUtil.get_arg( - op, MaceKeyword.mace_axis_str) - if axis_arg.i == 1: - axis_arg.i = 3 - - elif op.type == MaceOp.Squeeze.name: - for arg in op.arg: - if arg.name == MaceKeyword.mace_axis_str: - if ConverterUtil.data_format(op) == DataFormat.NCHW: - print("Transpose squeeze args: %s(%s)" - % (op.name, op.type)) - mace_check(list(arg.ints) == [2, 3], - 'only support squeeze at at [2, 3]') - arg.ints[:] = [1, 2] - - elif op.type == MaceOp.Reduce.name: - for arg in op.arg: - if arg.name == MaceKeyword.mace_axis_str: - if ConverterUtil.data_format( - op) == DataFormat.NCHW: - print("Transpose reduce args: %s(%s)" - % (op.name, op.type)) - reduce_axises = list(arg.ints) - new_axises = [] - for i in range(len(reduce_axises)): - idx = reduce_axises[i] - if idx == 2 or idx == 3: - new_axises.append(idx - 1) - elif idx == 1: - new_axises.append(3) - else: - new_axises.append(idx) - new_axises.sort() - arg.ints[:] = [] - arg.ints.extend(new_axises) - elif op.type == MaceOp.Crop.name: - offset_arg = ConverterUtil.get_arg(op, - MaceKeyword.mace_offset_str) - mace_check(offset_arg and - ConverterUtil.data_format(op) == DataFormat.NCHW and - len(op.output_shape[0].dims) == 4, - "MACE only support crop with NCHW format") - print("Transpose crop args: %s(%s)" - % (op.name, op.type)) - self.transpose_shape(offset_arg.ints, [0, 2, 3, 1]) - - # transpose op output shape - data_format = ConverterUtil.data_format(op) - if data_format is not None \ - and data_format != DataFormat.NHWC: - print("Transpose output shapes: %s(%s)" % (op.name, op.type)) - for output_shape in op.output_shape: - if len(output_shape.dims) == 4: - self.transpose_shape(output_shape.dims, - [0, 2, 3, 1]) - - return False - def add_winograd_arg(self): if self._wino_arg == 0: return False @@ -1428,17 +1341,122 @@ class Transformer(base_converter.ConverterInterface): def update_data_format(self): print("update data format") - data_format_flag = 1 - for input_node in self._option.input_nodes.values(): - if input_node.data_format.value == DataFormat.DF_NONE.value: - data_format_flag = 0 net = self._model for op in net.op: - ConverterUtil.del_arg( + df_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_data_format_str) - has_data_format_arg = op.arg.add() - has_data_format_arg.name = MaceKeyword.mace_has_data_format_str - has_data_format_arg.i = data_format_flag + if not df_arg: + df_arg = op.arg.add() + df_arg.name = MaceKeyword.mace_data_format_str + if op.type in MaceHasDataFormatOps: + df_arg.i = DataFormat.AUTO.value + elif op.type in MaceMayHasDataFormatOps: + input_df = DataFormat.AUTO.value + for input_tensor in op.input: + if input_tensor in self._consts: + continue + mace_check( + input_tensor in self._producer, + "Input tensor %s not in producer" % input_tensor) + father_op = self._producer[input_tensor] + temp_input_df = ConverterUtil.get_arg( + father_op, MaceKeyword.mace_data_format_str) + if temp_input_df.i != DataFormat.AUTO.value: + input_df = temp_input_df.i + if input_df == DataFormat.AUTO.value: + df_arg.i = input_df + # add flag to mark the ops may has data format + has_data_format_arg = op.arg.add() + has_data_format_arg.name = \ + MaceKeyword.mace_has_data_format_str + has_data_format_arg.i = 1 + return False + + def transpose_data_format(self): + print("Transpose arguments based on data format") + net = self._model + + src_data_format = ConverterUtil.data_format(net) + for op in net.op: + has_data_format = ConverterUtil.data_format(op) == \ + DataFormat.AUTO + # transpose args + if op.type == MaceOp.Pad.name: + for arg in op.arg: + if arg.name == MaceKeyword.mace_paddings_str: + mace_check(len(arg.ints) == 8, + "pad dim rank should be 8.") + if src_data_format == DataFormat.NCHW and \ + has_data_format: + print("Transpose pad args: %s(%s)" + % (op.name, op.type)) + self.transpose_shape(arg.ints, + [0, 1, 4, 5, 6, 7, 2, 3]) + elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name: + for arg in op.arg: + if arg.name == MaceKeyword.mace_axis_str: + if (src_data_format == DataFormat.NCHW + and has_data_format + and len(op.output_shape[0].dims) == 4): + print("Transpose concat/split args: %s(%s)" + % (op.name, op.type)) + if arg.i == 1: + arg.i = 3 + elif arg.i == 2: + arg.i = 1 + elif arg.i == 3: + arg.i = 2 + + producer = self._producer[op.input[0]] + input_shape = producer.output_shape[0].dims + if producer.type == MaceOp.FullyConnected.name and \ + len(input_shape) == 2: + axis_arg = ConverterUtil.get_arg( + op, MaceKeyword.mace_axis_str) + if axis_arg.i == 1: + axis_arg.i = 3 + + elif op.type == MaceOp.Reduce.name: + for arg in op.arg: + if arg.name == MaceKeyword.mace_axis_str: + if src_data_format == DataFormat.NCHW and \ + has_data_format: + print("Transpose reduce args: %s(%s)" + % (op.name, op.type)) + reduce_axises = list(arg.ints) + new_axises = [] + for i in range(len(reduce_axises)): + idx = reduce_axises[i] + if idx == 2 or idx == 3: + new_axises.append(idx - 1) + elif idx == 1: + new_axises.append(3) + else: + new_axises.append(idx) + new_axises.sort() + arg.ints[:] = [] + arg.ints.extend(new_axises) + elif op.type == MaceOp.Crop.name: + offset_arg = ConverterUtil.get_arg(op, + MaceKeyword.mace_offset_str) + mace_check(offset_arg and + src_data_format == DataFormat.NCHW + and has_data_format + and len(op.output_shape[0].dims) == 4, + "MACE only support crop with NCHW format") + print("Transpose crop args: %s(%s)" + % (op.name, op.type)) + self.transpose_shape(offset_arg.ints, [0, 2, 3, 1]) + + # transpose op output shape + if src_data_format == DataFormat.NCHW and \ + has_data_format: + print("Transpose output shapes: %s(%s)" % (op.name, op.type)) + for output_shape in op.output_shape: + if len(output_shape.dims) == 4: + self.transpose_shape(output_shape.dims, + [0, 2, 3, 1]) + return False def quantize_nodes(self): @@ -1493,7 +1511,7 @@ class Transformer(base_converter.ConverterInterface): self._model.input_info[i].zero_point = quantize_info.zero_point ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) + ConverterUtil.add_data_format_arg(op_def, input_node.data_format) # use actual ranges for model input quantize find_range_every_time_arg = op_def.arg.add() find_range_every_time_arg.name = \ @@ -1516,6 +1534,7 @@ class Transformer(base_converter.ConverterInterface): self._model.output_info[i].zero_point = quantize_info.zero_point ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) + ConverterUtil.add_data_format_arg(op_def, output_node.data_format) quantize_flag_arg = self._model.arg.add() quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str @@ -1886,9 +1905,6 @@ class Transformer(base_converter.ConverterInterface): shape_tensor.data_type = mace_pb2.DT_INT32 else: mace_check(False, "Only support reshape and flatten") - # NCHW -> NHWC - if len(dims) == 4: - self.transpose_shape(dims, [0, 2, 3, 1]) shape_tensor.int32_data.extend(dims) op.input.append(shape_tensor.name) @@ -2030,6 +2046,9 @@ class Transformer(base_converter.ConverterInterface): data_type_arg = quantize_op.arg.add() data_type_arg.name = MaceKeyword.mace_op_data_type_str data_type_arg.i = mace_pb2.DT_UINT8 + ConverterUtil.add_data_format_arg( + quantize_op, + self.get_tensor_data_format(input_tensor)) data_type_arg = quantize_op.arg.add() data_type_arg.name = MaceKeyword.mace_non_zero @@ -2050,8 +2069,8 @@ class Transformer(base_converter.ConverterInterface): del op.input[:] op.input.extend(quantized_inputs_names) - orginal_output_name = op.output[0] - op.output[0] = orginal_output_name + "_quant" + original_output_name = op.output[0] + op.output[0] = original_output_name + "_quant" op.output_type.extend([to_quantize_ops_output_type[op.type]]) data_type_arg = ConverterUtil.get_arg(op, MaceKeyword.mace_op_data_type_str) # noqa @@ -2064,13 +2083,15 @@ class Transformer(base_converter.ConverterInterface): dequantize_op.name = op.name + "_dequant" dequantize_op.type = MaceOp.Dequantize.name dequantize_op.input.extend([op.output[0]]) - dequantize_op.output.extend([orginal_output_name]) + dequantize_op.output.extend([original_output_name]) dequantize_op.output_shape.extend(op.output_shape) dequantize_op.output_type.extend([mace_pb2.DT_FLOAT]) data_type_arg = dequantize_op.arg.add() data_type_arg.name = MaceKeyword.mace_op_data_type_str data_type_arg.i = to_quantize_ops_output_type[op.type] - + ConverterUtil.add_data_format_arg( + dequantize_op, + self.get_tensor_data_format(original_output_name)) quantize_flag_arg = ConverterUtil.get_arg(self._model, MaceKeyword.mace_quantize_flag_arg_str) # noqa if quantize_flag_arg is None: diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 89bee8d8f9dba8ce27ff97ff016381eb7b9da5e7..0d1396c498988ac39f2d1509c8eff90c2deeccab 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -80,7 +80,7 @@ void CreateInputInfo(NetDef *net_def) { input_info = net_def->add_input_info(); input_info->set_name({{ net.input_info[idx].name|tojson }}); input_info->set_data_type(static_cast({{ net.input_info[idx].data_type }})); - input_info->set_data_format(static_cast({{ net.input_info[idx].data_format }})); + input_info->set_data_format({{ net.input_info[idx].data_format }}); input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }}); {% for dim in net.input_info[idx].dims %} input_info->add_dims({{ dim }}); @@ -97,7 +97,7 @@ void CreateOutputInfo(NetDef *net_def) { output_info = net_def->add_output_info(); output_info->set_name({{ net.output_info[idx].name|tojson }}); output_info->set_data_type(static_cast({{ net.output_info[idx].data_type }})); - output_info->set_data_format(static_cast({{ net.output_info[idx].data_format }})); + output_info->set_data_format({{ net.output_info[idx].data_format }}); output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }}); {% for dim in net.output_info[idx].dims %} output_info->add_dims({{dim}}); diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 4bf5f40bdd7300c6aa7f3ff2965e0b8be47a07a0..a06ce49347ea117d501c2d1273291be802b3dd69 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -48,7 +48,7 @@ void MaceRunFunc(const int in_out_size) { for (size_t i = 0; i < input_names.size(); ++i) { InputOutputInfo *info = net_def->add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names[i]); for (auto d : input_shapes[0]) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 0a852a17a9a9cfd6a7d331556b1ad1b1a85e397a..6cad55b91464937586398f77f7e0694011d6cbda 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -45,7 +45,7 @@ void MaceRun(const int in_out_size, for (size_t i = 0; i < input_names.size(); ++i) { InputOutputInfo *info = net_def->add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names[i]); for (auto d : max_shape) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h index 9cc1402f7558c9e5d0d1116eaef2fb161adda194..faaf144347f0020f39e6de3c9d50d7b553b03b17 100644 --- a/mace/test/mace_api_test.h +++ b/mace/test/mace_api_test.h @@ -76,7 +76,7 @@ void Conv3x3(const std::string &input_name, .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("has_data_format", 1) + .AddIntArg("data_format", static_cast(DataFormat::AUTO)) .Finalize(&operator_def); OutputShape *shape = operator_def.add_output_shape(); @@ -99,7 +99,7 @@ void Relu(const std::string &input_name, .AddStringArg("activation", "RELU") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddIntArg("device", static_cast(device_type)) - .AddIntArg("has_data_format", 1) + .AddIntArg("data_format", static_cast(DataFormat::AUTO)) .Finalize(&operator_def); net_def->add_op()->CopyFrom(operator_def); @@ -139,7 +139,8 @@ void CheckOutputs(const NetDef &net_def, if (D == DeviceType::CPU) { std::string input_name = input.first + "NHWC"; net.AddInputFromArray(input_name, input_shape, input_data); - net.TransformDataFormat(input_name, NHWC, input.first, NCHW); + net.TransformDataFormat( + input_name, DataFormat::NHWC, input.first, DataFormat::NCHW); } else { net.AddInputFromArray(input.first, input_shape, input_data); } @@ -154,7 +155,7 @@ void CheckOutputs(const NetDef &net_def, memcpy(data.data(), reinterpret_cast(tensor_data.data()) + tensor.offset(), tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); + net.AddInputFromArray(tensor.name(), shape, data, true); } net.RunNet(net_def, D); @@ -175,9 +176,9 @@ void CheckOutputs(const NetDef &net_def, if (D == DeviceType::CPU) { output_name = output.first + "NHWC"; net.TransformDataFormat(output.first, - NCHW, + DataFormat::NCHW, output_name, - NHWC); + DataFormat::NHWC); } ops::test::ExpectTensorNear(*tmp_tensor, *net.GetOutput(output_name.data()), diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 7fc0690df25c3f2dc094cc4f36109b3eba392e23..fca4a0fd42958110130e6317274b32a600106ab3 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -91,7 +91,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } }