From 95b32c2445f5ec9d02221d38c3ad68c0dd3c1857 Mon Sep 17 00:00:00 2001 From: liuqi Date: Mon, 22 Apr 2019 09:49:48 +0800 Subject: [PATCH] Update the memory type choose logic and polish some code. 1. Change DataFormat from enum to enum class. --- mace/benchmark/benchmark_model.cc | 2 +- mace/core/arg_helper.cc | 5 +- mace/core/arg_helper.h | 2 +- mace/core/memory_optimizer.cc | 3 +- mace/core/net.cc | 2 +- mace/core/net_def_adapter.cc | 165 +++++++++--------- mace/core/net_def_adapter.h | 24 ++- mace/core/net_optimizer.cc | 6 +- mace/core/net_optimizer.h | 13 ++ mace/core/operator.cc | 55 ++++-- mace/core/operator.h | 14 ++ mace/core/runtime/opencl/opencl_util.cc | 2 +- mace/core/workspace.cc | 2 +- mace/examples/cli/example.cc | 2 +- mace/libmace/capability.cc | 6 +- mace/libmace/mace.cc | 38 ++-- mace/ops/activation.cc | 6 +- mace/ops/activation_test.cc | 6 +- mace/ops/addn.cc | 6 +- mace/ops/arm/fp32/deconv_2d.cc | 2 +- mace/ops/batch_norm.cc | 2 +- mace/ops/batch_norm_test.cc | 38 ++-- mace/ops/batch_to_space.cc | 2 +- mace/ops/bias_add.cc | 6 +- mace/ops/bias_add_benchmark.cc | 2 - mace/ops/bias_add_test.cc | 24 +-- mace/ops/buffer_to_image_benchmark.cc | 1 - mace/ops/buffer_to_image_test.cc | 12 +- mace/ops/buffer_transform.cc | 5 +- mace/ops/buffer_transform_test.cc | 6 +- mace/ops/channel_shuffle.cc | 4 +- mace/ops/channel_shuffle_test.cc | 8 +- mace/ops/common/conv_pool_2d_util.cc | 76 ++++---- mace/ops/concat.cc | 8 +- mace/ops/conv_2d.cc | 10 +- mace/ops/conv_2d_test.cc | 108 ++++++------ mace/ops/crop.cc | 6 +- mace/ops/crop_test.cc | 12 +- mace/ops/cumsum_test.cc | 8 +- mace/ops/deconv_2d.cc | 6 +- mace/ops/deconv_2d_test.cc | 19 +- mace/ops/depth_to_space.cc | 2 +- mace/ops/depth_to_space_test.cc | 16 +- mace/ops/depthwise_conv2d.cc | 22 ++- mace/ops/depthwise_conv2d_test.cc | 30 ++-- mace/ops/depthwise_deconv2d.cc | 4 +- mace/ops/depthwise_deconv2d_test.cc | 19 +- mace/ops/eltwise.cc | 2 +- mace/ops/eltwise_test.cc | 50 +++--- 
mace/ops/folded_batch_norm_test.cc | 42 ++--- mace/ops/fully_connected.cc | 2 +- mace/ops/fully_connected_test.cc | 19 +- mace/ops/local_response_norm_test.cc | 6 +- mace/ops/lstm_cell.cc | 2 +- mace/ops/opencl/buffer_transformer.h | 5 +- mace/ops/ops_test_util.cc | 45 +++-- mace/ops/ops_test_util.h | 16 +- mace/ops/pad.cc | 2 +- mace/ops/pad_test.cc | 30 ++-- mace/ops/pooling.cc | 10 +- mace/ops/pooling_test.cc | 68 ++++---- mace/ops/reduce.cc | 7 +- mace/ops/reduce_test.cc | 18 +- mace/ops/ref/deconv_2d.cc | 2 +- mace/ops/ref/depthwise_deconv_2d.cc | 4 +- mace/ops/resize_bicubic.cc | 2 +- mace/ops/resize_bicubic_test.cc | 32 ++-- mace/ops/resize_bilinear.cc | 2 +- mace/ops/resize_bilinear_test.cc | 32 ++-- mace/ops/resize_nearest_neighbor.cc | 2 +- mace/ops/resize_nearest_neighbor_test.cc | 24 +-- mace/ops/softmax.cc | 2 +- mace/ops/softmax_test.cc | 12 +- mace/ops/space_to_batch.cc | 2 +- mace/ops/space_to_batch_test.cc | 48 ++--- mace/ops/space_to_depth.cc | 2 +- mace/ops/space_to_depth_test.cc | 16 +- mace/ops/split.cc | 4 +- mace/ops/sqrdiff_mean.cc | 2 +- mace/ops/sqrdiff_mean_test.cc | 24 +-- mace/ops/strided_slice_test.cc | 16 +- mace/public/mace.h | 6 +- mace/python/tools/converter.py | 2 +- .../tools/converter_tool/base_converter.py | 8 +- .../tools/converter_tool/onnx_converter.py | 5 +- .../tools/converter_tool/transformer.py | 26 +-- mace/python/tools/model.jinja2 | 4 +- mace/test/mace_api_mt_test.cc | 2 +- mace/test/mace_api_test.cc | 2 +- mace/test/mace_api_test.h | 13 +- mace/tools/validation/mace_run.cc | 2 +- 91 files changed, 791 insertions(+), 648 deletions(-) diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index e0dac730..98807b67 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -83,7 +83,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return 
DataFormat::NONE; } } diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index f2a6467b..2cb1379b 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -123,14 +123,13 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f) \ MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i) \ MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i) \ - MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i) \ - MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, std::string, s) + MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i) MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef) MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef) #undef MACE_SET_OPTIONAL_ARGUMENT_FUNC -std::string OutputMemoryTypeTagName() { +const std::string OutputMemoryTypeTagName() { static const char *kOutputMemTypeArgName = "output_mem_type"; return kOutputMemTypeArgName; } diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 5512fb06..e3a6319a 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -65,7 +65,7 @@ void SetProtoArg(NetDef *op_def, const std::string &arg_name, const T&value); -std::string OutputMemoryTypeTagName(); +const std::string OutputMemoryTypeTagName(); bool IsQuantizedModel(const NetDef &def); diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 9b572071..b781682f 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -126,7 +126,8 @@ void MemoryOptimizer::Optimize( DataFormat data_format = static_cast( ProtoArgHelper::GetOptionalArg( - *op_def, "data_format", DataFormat::DF_NONE)); + *op_def, "data_format", + static_cast(DataFormat::NONE))); int output_size = op_def->output_size(); for (int i = 0; i < output_size; ++i) { if (i < op_def->output_type_size()) { diff --git a/mace/core/net.cc b/mace/core/net.cc index c6e676d2..8c301dc7 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -76,7 +76,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, #ifdef 
MACE_ENABLE_OPENCL if (target_device_->device_type() == DeviceType::GPU) { - // update the map : output_tensor -> Operation + // update the map : output_tensor -> MemoryType MemoryType out_mem_type = static_cast( ProtoArgHelper::GetOptionalArg( diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc index fe89e810..7c7bb865 100644 --- a/mace/core/net_def_adapter.cc +++ b/mace/core/net_def_adapter.cc @@ -37,7 +37,7 @@ DataFormat GetDefaultDataFormat(DeviceType device_type, return DataFormat::NHWC; } else { LOG(FATAL) << "MACE do not support the device " << device_type; - return DataFormat::DF_NONE; + return DataFormat::NONE; } } @@ -50,19 +50,21 @@ std::string TransformedName(const std::string &input_name, return ss.str(); } +#ifdef MACE_ENABLE_OPENCL bool TransformRequiredOp(const std::string &op_type) { static const std::unordered_set kNoTransformOp = { "Shape", "InferConv2dShape" }; return kNoTransformOp.count(op_type) == 0; } +#endif // MACE_ENABLE_OPENCL void BuildTransposeOpDef( const std::string &input_name, const std::string &output_name, - const std::vector &output_shape, + const std::vector &output_shape, const std::vector dst_dims, - const mace::DataType dt, + const DataType dt, DeviceType device_type, OperatorDef *op_def) { std::string op_name = "mace_node_" + output_name; @@ -89,21 +91,13 @@ void BuildTransposeOpDef( } // namespace -NetDefAdapter::NetDefAdapter(const mace::OpRegistryBase *op_registry, - const mace::Workspace *ws) +NetDefAdapter::NetDefAdapter(const OpRegistryBase *op_registry, + const Workspace *ws) : op_registry_(op_registry), ws_(ws) {} -// Adapt original net_def to a better net. -// 1. Adapt device: choose best device for every op in the net. -// 2. Adapt data type: Add data type related transform ops -// for mixing precision. -// 3. Adapt data format: confirm data format of every op -// and add transpose if necessary. -// 4. 
Adapt memory type: Add BufferTransform if necessary -// for transforming memory type between ops. MaceStatus NetDefAdapter::AdaptNetDef( - const mace::NetDef *net_def, - mace::Device *target_device, + const NetDef *net_def, + Device *target_device, NetDef *target_net_def) { MACE_LATENCY_LOGGER(1, "Adapting original NetDef"); // Copy from original op_def, leave ops alone. @@ -115,7 +109,7 @@ MaceStatus NetDefAdapter::AdaptNetDef( std::unique_ptr cpu_device = make_unique( target_device->cpu_runtime()->num_threads(), target_device->cpu_runtime()->policy(), - target_device->cpu_runtime()->use_gemmlowp()); + &(target_device->cpu_runtime()->thread_pool())); // quantize model flag bool is_quantized_model = IsQuantizedModel(*net_def); @@ -131,40 +125,40 @@ MaceStatus NetDefAdapter::AdaptNetDef( std::vector(tensor.dims().begin(), tensor.dims().end()); } + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (target_device->device_type() == DeviceType::CPU) { + mem_type = MemoryType::CPU_BUFFER; + } else if (target_device->device_type() == DeviceType::GPU) { + mem_type = MemoryType::GPU_BUFFER; + } else { + LOG(FATAL) << "MACE do not support the device type: " + << target_device->device_type(); + } + int input_size = target_net_def->input_info_size(); for (int i = 0; i < input_size; ++i) { auto input_info = target_net_def->mutable_input_info(i); - MemoryType mem_type = MemoryType::CPU_BUFFER; - if (target_device->device_type() == DeviceType::CPU) { - mem_type = MemoryType::CPU_BUFFER; - } else if (target_device->device_type() == DeviceType::GPU) { - mem_type = MemoryType::GPU_BUFFER; - } else { - LOG(FATAL) << "MACE do not support the device type: " - << target_device->device_type(); - } - DataFormat input_data_format = static_cast( + auto input_data_format = static_cast( input_info->data_format()); DataFormat expected_data_format = GetDefaultDataFormat( target_device->device_type(), is_quantized_model); - std::vector input_shape = - std::vector(input_info->dims().begin(), - 
input_info->dims().end()); - if (input_data_format != DataFormat::DF_NONE + std::vector input_shape(input_info->dims().begin(), + input_info->dims().end()); + if (input_data_format != DataFormat::NONE && input_data_format != expected_data_format && input_shape.size() == 4) { if (input_data_format == DataFormat::NHWC && expected_data_format == DataFormat::NCHW) { - std::vector dst_dims = {0, 3, 1, 2}; + std::vector dst_dims{0, 3, 1, 2}; input_data_format = DataFormat::NCHW; input_shape = TransposeShape(input_shape, dst_dims); } else if (input_data_format == DataFormat::NCHW && expected_data_format == DataFormat::NHWC) { - std::vector dst_dims = {0, 2, 3, 1}; + std::vector dst_dims{0, 2, 3, 1}; input_data_format = DataFormat::NHWC; input_shape = TransposeShape(input_shape, dst_dims); } - input_info->set_data_format(input_data_format); + input_info->set_data_format(static_cast(input_data_format)); int input_shape_size = input_shape.size(); for (int j = 0; j < input_shape_size; ++j) { input_info->set_dims(j, input_shape[j]); @@ -287,9 +281,10 @@ MaceStatus NetDefAdapter::AdaptNetDef( internal_output_info.data_format, transformed_op_def); // set data format arg - SetProtoArg(transformed_op_def, - "data_format", - internal_output_info.data_format); + SetProtoArg( + transformed_op_def, + "data_format", + static_cast(internal_output_info.data_format)); // set output memory type argument SetProtoArg(transformed_op_def, OutputMemoryTypeTagName(), @@ -309,7 +304,7 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, const TensorInfoMap &output_map, const NetDef *net_def, OperatorDef *op_def) { - VLOG(1) << "Adapt device for op " << op_def->name(); + VLOG(3) << "Adapt device for op " << op_def->name(); DeviceType target_device_type = target_device->device_type(); DeviceType device_type = DeviceType::CPU; context->set_device(cpu_device); @@ -335,15 +330,18 @@ MaceStatus NetDefAdapter::AdaptDevice(OpConditionContext *context, producer_devices); if (device_type 
== target_device_type) { context->set_device(target_device); + } else { + LOG(INFO) << "Op " << op_def->name() << " fall back to CPU"; } } op_def->set_device_type(device_type); return MaceStatus::MACE_SUCCESS; } -MaceStatus NetDefAdapter::AdaptDataType(mace::OpConditionContext *context, - mace::OperatorDef *op_def) { +MaceStatus NetDefAdapter::AdaptDataType(OpConditionContext *context, + OperatorDef *op_def) { MACE_UNUSED(context); + // Where to add logic to support mixing precision // Adjust data type of op ran on CPU DataType dtype = static_cast( ProtoArgHelper::GetOptionalArg( @@ -355,20 +353,20 @@ MaceStatus NetDefAdapter::AdaptDataType(mace::OpConditionContext *context, } MaceStatus NetDefAdapter::AdaptDataFormat( - mace::OpConditionContext *context, - mace::OperatorDef *op_def, + OpConditionContext *context, + OperatorDef *op_def, bool is_quantized_model, TensorInfoMap *output_map, std::unordered_set *transformed_set, DataFormat *op_output_df, - mace::NetDef *target_net_def) { - VLOG(1) << "Adapt data format for op " << op_def->name(); - MACE_UNUSED(context); + NetDef *target_net_def) { + VLOG(3) << "Adapt data format for op " << op_def->name(); DataFormat op_data_format = static_cast(ProtoArgHelper::GetOptionalArg( - *op_def, "data_format", 0)); + *op_def, "data_format", + static_cast(DataFormat::NONE))); // adjust the data format of operation - if (op_data_format == DataFormat::DF_AUTO) { + if (op_data_format == DataFormat::AUTO) { op_data_format = GetDefaultDataFormat( static_cast(op_def->device_type()), is_quantized_model); SetProtoArg(op_def, "data_format", static_cast(op_data_format)); @@ -376,14 +374,15 @@ MaceStatus NetDefAdapter::AdaptDataFormat( int output_shape_size = op_def->output_shape_size(); for (int i = 0; i < output_shape_size; ++i) { auto output_shape = op_def->mutable_output_shape(i); - if (output_shape->dims_size() == 4) { - // transpose output shape format from NHWC to NCHW - int64_t height = output_shape->dims(1); - int64_t width = 
output_shape->dims(2); - output_shape->set_dims(1, output_shape->dims(3)); - output_shape->set_dims(2, height); - output_shape->set_dims(3, width); - } + MACE_CHECK(output_shape->dims_size() == 4, + "Output shape should be 4D if the of has data format. ", + op_def->name()); + // transpose output shape format from NHWC to NCHW + int64_t height = output_shape->dims(1); + int64_t width = output_shape->dims(2); + output_shape->set_dims(1, output_shape->dims(3)); + output_shape->set_dims(2, height); + output_shape->set_dims(3, width); } } } @@ -394,8 +393,8 @@ MaceStatus NetDefAdapter::AdaptDataFormat( if (op_def->device_type() == DeviceType::GPU) { target_mem_type = MemoryType::GPU_BUFFER; } - // Use op's data format as inputs' data format for now. - // Could move the logic to OpRegistry if necessary. + auto inputs_data_format = op_registry_->InputsDataFormat(op_def->type(), + context); DataFormat src_df, dst_df; int input_size = op_def->input_size(); for (int i = 0; i < input_size; ++i) { @@ -408,20 +407,21 @@ MaceStatus NetDefAdapter::AdaptDataFormat( continue; } src_df = output_map->at(op_def->input(i)).data_format; - dst_df = op_data_format; - if (src_df == DataFormat::DF_NONE - || dst_df == DataFormat::DF_NONE + dst_df = inputs_data_format[i]; + if (src_df == DataFormat::NONE + || dst_df == DataFormat::NONE || output_map->at(op_def->input(i)).shape.size() != 4) { continue; } if (src_df != dst_df) { std::string transformed_name = TransformedName(op_def->input(i), - "data_format", dst_df); + "data_format", static_cast(dst_df)); if (transformed_set->count(transformed_name) == 0) { VLOG(1) << "Add Transpose operation " << op_def->name() << " to transpose tensor " << op_def->input(i) << "', from data format " - << src_df << " to " << dst_df; + << static_cast(src_df) << " to " + << static_cast(dst_df); // Only support transpose between NHWC and NCHW for now. 
std::vector dst_dims; if (src_df == DataFormat::NCHW && dst_df == DataFormat::NHWC) { @@ -430,7 +430,8 @@ MaceStatus NetDefAdapter::AdaptDataFormat( dst_dims = {0, 3, 1, 2}; } else { LOG(FATAL) << "Encounter unsupported data format transpose from " - << src_df << " to " << dst_df; + << static_cast(src_df) << " to " + << static_cast(dst_df); } auto &input_info = output_map->at(op_def->input(i)); auto output_shape = input_info.shape.empty() ? @@ -449,7 +450,7 @@ MaceStatus NetDefAdapter::AdaptDataFormat( // set data format arg SetProtoArg(transpose_op_def, "data_format", - dst_df); + static_cast(dst_df)); // set output memory type argument SetProtoArg(transpose_op_def, OutputMemoryTypeTagName(), @@ -475,20 +476,20 @@ MaceStatus NetDefAdapter::AdaptDataFormat( } MaceStatus NetDefAdapter::AdaptMemoryType( - mace::OpConditionContext *context, - mace::OperatorDef *op_def, - mace::NetDefAdapter::TensorInfoMap *output_map, + OpConditionContext *context, + OperatorDef *op_def, + NetDefAdapter::TensorInfoMap *output_map, std::unordered_set *transformed_set, MemoryType *op_output_mem_types, - mace::NetDef *target_net_def) { - VLOG(1) << "Adapt memory type for op " << op_def->name(); + NetDef *target_net_def) { + VLOG(3) << "Adapt memory type for op " << op_def->name(); // Get expected output memory type // (only support one kind of memory type for multiple outputs) op_registry_->GetInOutMemoryTypes(op_def->type(), context); #ifdef MACE_ENABLE_OPENCL - int input_size = op_def->input_size(); // if op is memory-unused op, no transformation if (TransformRequiredOp(op_def->type())) { + int input_size = op_def->input_size(); for (int i = 0; i < input_size; ++i) { if (output_map->count(op_def->input(i)) == 0) { MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr @@ -498,14 +499,14 @@ MaceStatus NetDefAdapter::AdaptMemoryType( continue; } auto &input_info = output_map->at(op_def->input(i)); - if (input_info.data_format == DataFormat::DF_NONE - || input_info.shape.size() != 4) { - 
continue; - } // check whether to do transform MemoryType src_mem_type = input_info.mem_type; MemoryType dst_mem_type = context->GetInputMemType(i); - if (src_mem_type != dst_mem_type) { + auto wanted_input_dtype = context->GetInputDataType(i); + if (src_mem_type != dst_mem_type || + (input_info.dtype != wanted_input_dtype && + (src_mem_type != MemoryType::CPU_BUFFER + || dst_mem_type != MemoryType::CPU_BUFFER))) { auto transformed_name = TransformedName(op_def->input(i), "mem_type", dst_mem_type); @@ -521,7 +522,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType( op_def->input(i), input_info.shape, transformed_name, - context->GetInputDataType(i), + wanted_input_dtype, context->GetInputOpenCLBufferType(i), dst_mem_type, input_info.data_format, @@ -529,7 +530,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType( // set data format arg SetProtoArg(transformed_op_def, "data_format", - input_info.data_format); + static_cast(input_info.data_format)); // set output memory type argument SetProtoArg(transformed_op_def, OutputMemoryTypeTagName(), @@ -564,7 +565,7 @@ MaceStatus NetDefAdapter::AdaptMemoryType( return MaceStatus::MACE_SUCCESS; } -std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) { +std::string NetDefAdapter::DebugString(const NetDef *net_def) { std::stringstream sstream; auto DeviceTypeToStrFunc = [](DeviceType device_type) -> std::string { if (device_type == DeviceType::CPU) { @@ -591,10 +592,10 @@ std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) { return "NHWC"; } else if (type == DataFormat::NCHW) { return "NCHW"; - } else if (type == DataFormat::DF_NONE) { - return "DF_NONE"; - } else if (type == DataFormat::DF_AUTO) { - return "DT_AUTO"; + } else if (type == DataFormat::NONE) { + return "NONE"; + } else if (type == DataFormat::AUTO) { + return "AUTO"; } else if (type == DataFormat::OIHW) { return "OIHW"; } else { @@ -615,7 +616,7 @@ std::string NetDefAdapter::DebugString(const mace::NetDef *net_def) { std::string data_format = 
DataFormatToStrFunc( static_cast( ProtoArgHelper::GetOptionalArg( - op, "data_format", 0))); + op, "data_format", static_cast(DataFormat::NONE)))); sstream << std::endl; sstream << "{" << std::endl; diff --git a/mace/core/net_def_adapter.h b/mace/core/net_def_adapter.h index 7f3a6754..d821ed81 100644 --- a/mace/core/net_def_adapter.h +++ b/mace/core/net_def_adapter.h @@ -32,16 +32,22 @@ class OpRegistryBase; class Workspace; class Device; -/** - * Conventions: - * 1. DataFormat::DT_AUTO stands for formatted (NHWC or NCHW) - * 2. if Op with DataFormat::DT_AUTO, the arguments of this op - * is formatted to NHWC - */ +/// Conventions: +/// 1. DataFormat::AUTO stands for formatted (NHWC or NCHW) +/// 2. if Op with DataFormat::AUTO, the arguments of this op +/// is formatted to NHWC class NetDefAdapter { public: NetDefAdapter(const OpRegistryBase *op_registry, const Workspace *ws); + // Adapt original net_def to a better net. + // 1. Adapt device: choose best device for every op in the net. + // 2. Adapt data type: Add data type related transform ops + // for mixing precision. + // 3. Adapt data format: confirm data format of every op + // and add transpose if necessary. + // 4. Adapt memory type: Add BufferTransform if necessary + // for transforming memory type between ops. 
MaceStatus AdaptNetDef( const NetDef *net_def, Device *target_device, NetDef *target_net_def); @@ -91,12 +97,12 @@ class NetDefAdapter { NetDef *target_net_def); MaceStatus AdaptMemoryType( - mace::OpConditionContext *context, - mace::OperatorDef *op_def, + OpConditionContext *context, + OperatorDef *op_def, TensorInfoMap *output_map, std::unordered_set *transformed_set, MemoryType *op_output_mem_types, - mace::NetDef *target_net_def); + NetDef *target_net_def); std::string DebugString(const NetDef *net_def); diff --git a/mace/core/net_optimizer.cc b/mace/core/net_optimizer.cc index 565a42c1..4382b51b 100644 --- a/mace/core/net_optimizer.cc +++ b/mace/core/net_optimizer.cc @@ -19,10 +19,10 @@ namespace mace { DeviceType NetOptimizer::SelectBestDevice( - const mace::OperatorDef *op_def, + const OperatorDef *op_def, DeviceType target_device_type, - const std::set &available_devices, - const std::vector &inputs_op_devices) { + const std::set &available_devices, + const std::vector &inputs_op_devices) { static const std::set kComputeIntensiveOps = { "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d", "FullyConnected" }; diff --git a/mace/core/net_optimizer.h b/mace/core/net_optimizer.h index 8ec8dc23..23f1897c 100644 --- a/mace/core/net_optimizer.h +++ b/mace/core/net_optimizer.h @@ -23,8 +23,21 @@ namespace mace { +/// Any optimization for Net could be put in here in the future. class NetOptimizer { public: + /// Select the best device for the op to support mixed usage of CPU and GPU. + /// Greedy strategy: one way to the end. If the op falls back to CPU, then + /// the follow-up ops will run on CPU too until meeting + /// some compute-intensive ops (Convolution) to + /// reduce the memory copy between CPU and GPU. + /// Simple but effective. 
+ /// + /// \param op_def the op + /// \param target_device target device to run on + /// \param available_devices available devices of the op + /// \param inputs_op_devices devices that the parent ops run on + /// \return Best device for the op_def DeviceType SelectBestDevice(const OperatorDef *op_def, DeviceType target_device, const std::set &available_devices, diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 275189a7..605ae3a7 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -21,22 +21,22 @@ namespace mace { OpConditionContext::OpConditionContext( - const mace::Workspace *ws, - mace::OpConditionContext::TensorShapeMap *info) + const Workspace *ws, + OpConditionContext::TensorShapeMap *info) : operator_def_(nullptr), ws_(ws), device_(nullptr), tensor_shape_info_(info) {} void OpConditionContext::set_operator_def( - const mace::OperatorDef *operator_def) { + const OperatorDef *operator_def) { operator_def_ = operator_def; input_data_types_.clear(); } void OpConditionContext::SetInputInfo(size_t idx, - mace::MemoryType mem_type, - mace::DataType dt) { + MemoryType mem_type, + DataType dt) { if (input_mem_types_.empty()) { // the default inputs' memory types are the same as the output memory type. 
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_); @@ -53,7 +53,7 @@ void OpConditionContext::SetInputInfo(size_t idx, input_data_types_[idx] = dt; } -void OpConditionContext::set_output_mem_type(mace::MemoryType type) { +void OpConditionContext::set_output_mem_type(MemoryType type) { MACE_CHECK(operator_def_ != nullptr); output_mem_type_ = type; input_mem_types_.clear(); @@ -106,7 +106,7 @@ OpConstructContext::OpConstructContext(Workspace *ws) device_(nullptr) {} void OpConstructContext::set_operator_def( - std::shared_ptr operator_def) { + std::shared_ptr operator_def) { operator_def_ = operator_def; } @@ -225,9 +225,20 @@ OpRegistrationInfo::OpRegistrationInfo() { context->set_output_mem_type(MemoryType::CPU_BUFFER); } }; + + data_format_selector = [](OpConditionContext *context) + -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return std::vector(context->operator_def()->input_size(), + op_data_format); + }; } -void OpRegistrationInfo::AddDevice(mace::DeviceType device) { +void OpRegistrationInfo::AddDevice(DeviceType device) { devices.insert(device); } @@ -239,9 +250,9 @@ void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) { MaceStatus OpRegistryBase::Register( const std::string &op_type, - const mace::DeviceType device_type, - const mace::DataType dt, - mace::OpRegistrationInfo::OpCreator creator) { + const DeviceType device_type, + const DataType dt, + OpRegistrationInfo::OpCreator creator) { if (registry_.count(op_type) == 0) { registry_[op_type] = std::unique_ptr( new OpRegistrationInfo); @@ -277,12 +288,20 @@ const std::set OpRegistryBase::AvailableDevices( void OpRegistryBase::GetInOutMemoryTypes( const std::string &op_type, - mace::OpConditionContext *context) const { + OpConditionContext *context) const { MACE_CHECK(registry_.count(op_type) != 0, op_type, " operation is not 
registered."); return registry_.at(op_type)->memory_type_setter(context); } +const std::vector OpRegistryBase::InputsDataFormat( + const std::string &op_type, + OpConditionContext *context) const { + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + return registry_.at(op_type)->data_format_selector(context); +} + std::unique_ptr OpRegistryBase::CreateOperation( OpConstructContext *context, DeviceType device_type) const { @@ -321,11 +340,17 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc( } OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter( - mace::OpRegistrationInfo::MemoryTypeSetter setter) { + OpRegistrationInfo::MemoryTypeSetter setter) { memory_type_setter_ = setter; return *this; } +OpConditionBuilder& OpConditionBuilder::SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector) { + data_format_selector_ = selector; + return *this; +} + void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { if (info != nullptr) { if (placer_) { @@ -334,6 +359,10 @@ void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { if (memory_type_setter_) { info->memory_type_setter = memory_type_setter_; } + + if (data_format_selector_) { + info->data_format_selector = data_format_selector_; + } } } diff --git a/mace/core/operator.h b/mace/core/operator.h index 35effdc5..9430d90d 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -117,6 +117,14 @@ class OpConstructContext { inline Device *device() const { return device_; } +#ifdef MACE_ENABLE_OPENCL + inline MemoryType GetOpMemoryType() const { + return static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, OutputMemoryTypeTagName(), + static_cast(MemoryType::CPU_BUFFER))); + } +#endif // MACE_ENABLE_OPENCL private: std::shared_ptr operator_def_; @@ -270,6 +278,9 @@ class OpConditionBuilder { OpConditionBuilder &SetInputMemoryTypeSetter( OpRegistrationInfo::MemoryTypeSetter setter); + 
OpConditionBuilder &SetInputsDataFormatSelector( + OpRegistrationInfo::DataFormatSelector selector); + void Finalize(OpRegistrationInfo *info) const; private: @@ -297,6 +308,9 @@ class OpRegistryBase { void GetInOutMemoryTypes( const std::string &op_type, OpConditionContext *context) const; + const std::vector InputsDataFormat( + const std::string &op_type, OpConditionContext *context) const; + std::unique_ptr CreateOperation( OpConstructContext *context, DeviceType device_type) const; diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc index 9f9001f3..20ae6a2b 100644 --- a/mace/core/runtime/opencl/opencl_util.cc +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -173,7 +173,7 @@ void OpenCLUtil::BuildTransformOpDef( arg->set_i(static_cast(dt)); arg = op_def->add_arg(); arg->set_name("data_format"); - arg->set_i(data_format); + arg->set_i(static_cast(data_format)); if (!input_shape.empty()) { OutputShape *shape = op_def->add_output_shape(); for (auto value : input_shape) { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index aa482bee..f1740765 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -269,7 +269,7 @@ MaceStatus Workspace::PreallocateOutputTensor( tensor_mem.second.data_type, false, tensor_mem.first)); tensor->set_data_format(tensor_mem.second.data_format); - if (tensor_mem.second.data_format != DataFormat::DF_NONE) { + if (tensor_mem.second.data_format != DataFormat::NONE) { if (mem_blocks[tensor_mem.second.mem_id].mem_type() == MemoryType::GPU_IMAGE) { VLOG(1) << "Tensor: " << tensor_mem.first diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index bbb7c710..054231e9 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -94,7 +94,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } } diff 
--git a/mace/libmace/capability.cc b/mace/libmace/capability.cc index d37a62b6..46896fcd 100644 --- a/mace/libmace/capability.cc +++ b/mace/libmace/capability.cc @@ -143,7 +143,7 @@ void BMNet::SetUp() { // Add input and output information for (size_t i = 0; i < input_names_.size(); ++i) { InputOutputInfo *info = net_.add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names_[i]); for (auto d : input_shapes_[i]) { info->add_dims(static_cast(d)); @@ -244,7 +244,7 @@ void BMNet::AddConv(const std::string &conv_type, op_def->add_output(output_name); AddIntsArg(op_def, "strides", strides); AddIntArg(op_def, "padding", padding_type); - AddIntArg(op_def, "has_data_format", 1); + AddIntArg(op_def, "data_format", static_cast(DataFormat::AUTO)); AddIntArg(op_def, "T", DT_HALF); if (has_relu6) { AddStringArg(op_def, "activation", "RELUX"); @@ -271,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name, op_def->add_output(output); AddIntArg(op_def, "type", type); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "has_data_format", 1); + AddIntArg(op_def, "data_format", static_cast(DataFormat::AUTO)); OutputShape *shape = op_def->add_output_shape(); for (auto dim : output_shape) { shape->add_dims(dim); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index f00ce2e6..08aaf9f3 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -283,9 +283,9 @@ MaceTensor::MaceTensor(const std::vector &shape, std::shared_ptr data, const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); - MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC - || format == DataFormat::NCHW || format == OIHW, - "MACE only support DF_NONE, NHWC, NCHW and OIHW " + MACE_CHECK(format == DataFormat::NONE || format == DataFormat::NHWC + || format == DataFormat::NCHW || format == DataFormat::OIHW, + "MACE only support NONE, NHWC, NCHW and OIHW " "formats of input now."); impl_ = 
make_unique(); impl_->shape = shape; @@ -496,7 +496,7 @@ MaceStatus MaceEngine::Impl::Init( DataType output_dt = output_info_map_[output_name].data_type(); Tensor *output_tensor = ws_->CreateTensor(output_name, device_->allocator(), output_dt); - output_tensor->set_data_format(NHWC); + output_tensor->set_data_format(DataFormat::NHWC); #endif } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) @@ -585,14 +585,14 @@ MaceEngine::Impl::~Impl() { MaceStatus MaceEngine::Impl::TransposeInput( const std::pair &input, Tensor *input_tensor) { - bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE; - DataFormat data_format = DataFormat::DF_NONE; + bool has_data_format = input_tensor->data_format() != DataFormat::NONE; + DataFormat data_format = DataFormat::NONE; DataType input_dt = input_tensor->dtype(); if (has_data_format) { std::vector dst_dims; if (device_->device_type() == DeviceType::CPU && input.second.shape().size() == 4 && - input.second.data_format() == NHWC && + input.second.data_format() == DataFormat::NHWC && !is_quantized_model_) { VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; input_tensor->set_data_format(DataFormat::NCHW); @@ -654,28 +654,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput( DataType output_dt = output_tensor->dtype(); // save output if (output_tensor != nullptr && output->second.data() != nullptr) { - if (output_tensor->data_format() != DataFormat::DF_NONE && - output->second.data_format() != DataFormat::DF_NONE && + if (output_tensor->data_format() != DataFormat::NONE && + output->second.data_format() != DataFormat::NONE && output->second.shape().size() == 4 && output->second.data_format() != output_tensor->data_format()) { VLOG(1) << "Transform output " << output->first << " from " - << output_tensor->data_format() << " to " - << output->second.data_format(); + << static_cast(output_tensor->data_format()) << " to " + << static_cast(output->second.data_format()); std::vector dst_dims; - 
if (output_tensor->data_format() == NCHW && - output->second.data_format() == NHWC) { + if (output_tensor->data_format() == DataFormat::NCHW && + output->second.data_format() == DataFormat::NHWC) { dst_dims = {0, 2, 3, 1}; - } else if (output_tensor->data_format() == NHWC && - output->second.data_format() == NCHW) { + } else if (output_tensor->data_format() == DataFormat::NHWC && + output->second.data_format() == DataFormat::NCHW) { dst_dims = {0, 3, 1, 2}; } else { LOG(FATAL) << "Not supported output data format: " - << output->second.data_format() << " vs " - << output_tensor->data_format(); + << static_cast(output->second.data_format()) << " vs " + << static_cast(output_tensor->data_format()); } VLOG(1) << "Transform output " << output->first << " from " - << output_tensor->data_format() << " to " - << output->second.data_format(); + << static_cast(output_tensor->data_format()) << " to " + << static_cast(output->second.data_format()); std::vector shape = TransposeShape(output_tensor->shape(), dst_dims); diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 1d697488..6cb21b5c 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -96,7 +96,7 @@ class ActivationOp : public Operation { auto leakyrelu_coefficient = static_cast( Operation::GetOptionalArg("leakyrelu_coefficient", 0.0f)); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( type, relux_max_limit, leakyrelu_coefficient); @@ -140,11 +140,13 @@ void RegisterActivation(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != 
op->output_size()) || op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f16cf060..c2c95882 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -207,7 +207,8 @@ void TestSimplePrelu() { // Run net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Activation", "PreluTest") .Input("InputNCHW") .Input("Alpha") @@ -217,7 +218,8 @@ void TestSimplePrelu() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor( diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index d5175180..27bce71b 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -69,7 +69,7 @@ class AddNOp : public Operation { public: explicit AddNOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -109,11 +109,13 @@ void RegisterAddN(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != op->output_size()) || op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/fp32/deconv_2d.cc index a80d6d64..41a01a6c 100644 --- a/mace/ops/arm/fp32/deconv_2d.cc +++ b/mace/ops/arm/fp32/deconv_2d.cc @@ -54,7 +54,7 @@ MaceStatus 
Deconv2dBase::ResizeOutAndPadOut( out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index c6559032..4e303d07 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -174,7 +174,7 @@ class BatchNormOp : public Operation { float leakyrelu_coefficient = Operation::GetOptionalArg( "leakyrelu_coefficient", 0.0f); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( epsilon, activation, relux_max_limit, leakyrelu_coefficient); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 495a2409..83c8219f 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -34,7 +34,8 @@ void Simple() { net.AddInputFromArray("Var", {1}, {11.67f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") .Input("Scale") @@ -47,7 +48,8 @@ void Simple() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BatchNorm", "BatchNormTest") .Input("Input") @@ -93,8 +95,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BatchNorm", "BatchNormTest") @@ -112,8 +114,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - 
net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -163,8 +165,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -179,8 +181,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -230,8 +232,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -246,8 +248,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -296,8 +298,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { net.AddRandomInput("Mean", {channels}, true); net.AddRandomInput("Var", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "BatchNormTest") .Input("InputNCHW") @@ -312,8 +314,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // run cpu 
net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index c44501f1..03ac91ff 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -264,7 +264,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { public: explicit BatchToSpaceNDOp(OpConstructContext *context) : BatchToSpaceOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 7991a088..72e93fec 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -103,7 +103,7 @@ class BiasAddOp : public Operation { : Operation(context), has_data_format_(Operation::GetOptionalArg("has_data_format", 1)) { MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { @@ -151,11 +151,13 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != op->output_size()) || op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 34f6a713..8c51b703 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -27,9 +27,7 @@ void BiasAdd(int iters, int batch, 
int channels, int height, int width) { OpsTestNet net; // Add input data - DataFormat data_format = NHWC; if (D == DeviceType::CPU) { - data_format = NCHW; net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 2e4764ca..0126abb9 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -31,8 +31,8 @@ void BiasAddSimple() { net.AddInputFromArray("Bias", {1}, {0.5f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") @@ -41,8 +41,8 @@ void BiasAddSimple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("Input") @@ -83,8 +83,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { {batch, height, width, channels}); net.AddRandomInput("Bias", {channels}, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BiasAdd", "BiasAddTest") @@ -97,8 +97,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -132,8 +132,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { {batch, height, width, channels}); net.AddRandomInput("Bias", {channels}, true, true); - net.TransformDataFormat("Input", 
NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("BiasAdd", "BiasAddTest") @@ -146,8 +146,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 92733d61..2a8c42b3 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -48,7 +48,6 @@ void FilterBufferToImage(int iters, OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE, 0, - DataFormat::NHWC, b2i_output); }; diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index a819b6a7..cb52eafe 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -37,14 +37,14 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -178,14 +178,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, 
net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DT_FLOAT); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -218,14 +218,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type, // Transform OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output); + type, MemoryType::GPU_IMAGE, 0, b2i_output); // Inverse Transform Tensor *i2b_output = net.ws()->CreateTensor( "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) .Transform(&context, b2i_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output); + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index f8bf025d..7e59b339 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -39,14 +39,11 @@ class BufferTransformOp : public Operation { auto type = static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(CONV2D_FILTER))); - DataFormat data_format = static_cast( - Operation::GetOptionalArg("data_format", DataFormat::DF_NONE)); MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( - context, input, type, out_mem_type_, wino_blk_size_, - 
data_format, output); + context, input, type, out_mem_type_, wino_blk_size_, output); } private: diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc index b3f68a31..a9af4bc9 100644 --- a/mace/ops/buffer_transform_test.cc +++ b/mace/ops/buffer_transform_test.cc @@ -48,7 +48,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_BUFFER) .Transform(&context, net.ws()->GetTensor("Input"), - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, bt_output); + type, MemoryType::GPU_BUFFER, 0, bt_output); // Inverse Transform Tensor *output = net.ws()->CreateTensor( @@ -57,7 +57,7 @@ void TestBidirectionTransform(const OpenCLBufferType type, OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_BUFFER) .Transform(&context, bt_output, - type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, output); + type, MemoryType::GPU_BUFFER, 0, output); if (DataTypeToEnum::value == DataTypeToEnum::value) { EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), @@ -94,7 +94,7 @@ void TestArgumentTransform(const index_t input_size) { MemoryType::GPU_BUFFER) .Transform(&context, net.ws()->GetTensor("Input"), OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, - 0, DataFormat::NHWC, output); + 0, output); index_t expected_size = RoundUp(input_size, 4); EXPECT_EQ(expected_size, output->buffer_shape()[0]); diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 09811828..d68ebbbe 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -82,7 +82,7 @@ class ChannelShuffleOp : public Operation { explicit ChannelShuffleOp(OpConstructContext *context) : Operation(context) { const int groups = Operation::GetOptionalArg("group", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(groups); } else { MACE_NOT_IMPLEMENTED; @@ -119,7 +119,7 @@ void 
RegisterChannelShuffle(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU }; + return { DeviceType::CPU, DeviceType::GPU }; } int groups = ProtoArgHelper::GetOptionalArg( *op, "group", 1); diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index d59b45d8..4e25448b 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -28,8 +28,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { "Input", {1, 1, 2, 8}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") @@ -40,8 +40,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor( diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc index 2ca95a7d..43988881 100644 --- a/mace/ops/common/conv_pool_2d_util.cc +++ b/mace/ops/common/conv_pool_2d_util.cc @@ -40,19 +40,19 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, index_t input_height = 0, input_width = 0; index_t kernel_height = 0, kernel_width = 0; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { input_height = input_shape[2]; input_width = input_shape[3]; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { input_height = input_shape[1]; input_width = input_shape[2]; } else { MACE_NOT_IMPLEMENTED; } - if (filter_format == OIHW) { + if (filter_format == DataFormat::OIHW) { kernel_height = filter_shape[2]; kernel_width = filter_shape[3]; - } else if 
(filter_format == OHWI) { + } else if (filter_format == DataFormat::OHWI) { kernel_height = filter_shape[1]; kernel_width = filter_shape[2]; } else { @@ -97,11 +97,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, 0, (output_width - 1) * strides[1] + k_extent_width - input_width); output_shape[0] = input_shape[0]; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { output_shape[1] = output_channels; output_shape[2] = output_height; output_shape[3] = output_width; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { output_shape[1] = output_height; output_shape[2] = output_width; output_shape[3] = output_channels; @@ -117,7 +117,8 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW Padding padding, index_t *output_shape, int *padding_size) { - CalcPaddingAndOutputSize(input_shape, NCHW, filter_shape, OIHW, dilations, + CalcPaddingAndOutputSize(input_shape, DataFormat::NCHW, filter_shape, + DataFormat::OIHW, dilations, strides, padding, output_shape, padding_size); } @@ -128,7 +129,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC Padding padding, index_t *output_shape, int *padding_size) { - CalcPaddingAndOutputSize(input_shape, NHWC, filter_shape, OIHW, dilations, + CalcPaddingAndOutputSize(input_shape, DataFormat::NHWC, filter_shape, + DataFormat::OIHW, dilations, strides, padding, output_shape, padding_size); } @@ -151,19 +153,19 @@ void CalcOutputSize(const index_t *input_shape, index_t input_height = 0, input_width = 0; index_t kernel_height = 0, kernel_width = 0; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { input_height = input_shape[2]; input_width = input_shape[3]; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { input_height = input_shape[1]; input_width = input_shape[2]; } else { MACE_NOT_IMPLEMENTED; } - if (filter_format == OIHW) { + if (filter_format == DataFormat::OIHW) { 
kernel_height = filter_shape[2]; kernel_width = filter_shape[3]; - } else if (filter_format == OHWI) { + } else if (filter_format == DataFormat::OHWI) { kernel_height = filter_shape[1]; kernel_width = filter_shape[2]; } else { @@ -195,11 +197,11 @@ void CalcOutputSize(const index_t *input_shape, } output_shape[0] = input_shape[0]; - if (input_format == NCHW) { + if (input_format == DataFormat::NCHW) { output_shape[1] = output_channels; output_shape[2] = output_height; output_shape[3] = output_width; - } else if (input_format == NHWC) { + } else if (input_format == DataFormat::NHWC) { output_shape[1] = output_height; output_shape[2] = output_width; output_shape[3] = output_channels; @@ -215,7 +217,8 @@ void CalcOutputSize(const index_t *input_shape, // NHWC const int *strides, const RoundType round_type, index_t *output_shape) { - CalcOutputSize(input_shape, NHWC, filter_shape, OIHW, padding_size, dilations, + CalcOutputSize(input_shape, DataFormat::NHWC, filter_shape, + DataFormat::OIHW, padding_size, dilations, strides, round_type, output_shape); } @@ -226,7 +229,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW const int *strides, const RoundType round_type, index_t *output_shape) { - CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations, + CalcOutputSize(input_shape, DataFormat::NCHW, filter_shape, + DataFormat::OIHW, padding_size, dilations, strides, round_type, output_shape); } @@ -241,14 +245,18 @@ void CalcDeconvShape_TF(const std::vector &input_shape, std::vector *padded_out_shape, DataFormat data_format) { const index_t - in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + in_height = + data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1]; const index_t - in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + in_width = + data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2]; const index_t - out_height = data_format == NCHW ? 
output_shape[2] : output_shape[1]; + out_height = + data_format == DataFormat::NCHW ? output_shape[2] : output_shape[1]; const index_t - out_width = data_format == NCHW ? output_shape[3] : output_shape[2]; + out_width = + data_format == DataFormat::NCHW ? output_shape[3] : output_shape[2]; const index_t extended_in_height = (in_height - 1) * strides[0] + 1; const index_t extended_in_width = (in_width - 1) * strides[1] + 1; @@ -307,11 +315,11 @@ void CalcDeconvShape_TF(const std::vector &input_shape, padded_out_shape->resize(4); (*padded_out_shape)[0] = output_shape[0]; (*padded_out_shape)[1] = - data_format == NCHW ? output_channel : padded_out_height; + data_format == DataFormat::NCHW ? output_channel : padded_out_height; (*padded_out_shape)[2] = - data_format == NCHW ? padded_out_height : padded_out_width; + data_format == DataFormat::NCHW ? padded_out_height : padded_out_width; (*padded_out_shape)[3] = - data_format == NCHW ? padded_out_width : output_channel; + data_format == DataFormat::NCHW ? padded_out_width : output_channel; } } @@ -325,9 +333,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, std::vector *padded_out_shape, DataFormat data_format) { const index_t - in_height = data_format == NCHW ? input_shape[2] : input_shape[1]; + in_height = + data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1]; const index_t - in_width = data_format == NCHW ? input_shape[3] : input_shape[2]; + in_width = + data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2]; const index_t output_channel = filter_shape[0] * group; @@ -351,11 +361,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, padded_out_shape->resize(4); (*padded_out_shape)[0] = input_shape[0]; (*padded_out_shape)[1] = - data_format == NCHW ? output_channel : padded_out_height; + data_format == DataFormat::NCHW ? output_channel : padded_out_height; (*padded_out_shape)[2] = - data_format == NCHW ? 
padded_out_height : padded_out_width; + data_format == DataFormat::NCHW ? padded_out_height : padded_out_width; (*padded_out_shape)[3] = - data_format == NCHW ? padded_out_width : output_channel; + data_format == DataFormat::NCHW ? padded_out_width : output_channel; } if (out_shape != nullptr) { @@ -363,9 +373,11 @@ void CalcDeconvShape_Caffe(const std::vector &input_shape, index_t out_width = padded_out_width - out_pad_size[1]; out_shape->resize(4); (*out_shape)[0] = input_shape[0]; - (*out_shape)[1] = data_format == NCHW ? output_channel : out_height; - (*out_shape)[2] = data_format == NCHW ? out_height : out_width; - (*out_shape)[3] = data_format == NCHW ? out_width : output_channel; + (*out_shape)[1] = + data_format == DataFormat::NCHW ? output_channel : out_height; + (*out_shape)[2] = data_format == DataFormat::NCHW ? out_height : out_width; + (*out_shape)[3] = + data_format == DataFormat::NCHW ? out_width : output_channel; } } @@ -385,7 +397,7 @@ void CalDeconvOutputShapeAndPadSize(const std::vector &input_shape, MACE_CHECK(output_shape->size() == 4, "deconv output shape shoud be 4-dims"); std::vector &out_shape = *output_shape; - if (data_format == NCHW) { + if (data_format == DataFormat::NCHW) { const index_t t = out_shape[1]; out_shape[1] = out_shape[3]; out_shape[3] = out_shape[2]; diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index d2bb5713..518e9cc2 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -199,7 +199,7 @@ class ConcatOp : public ConcatOpBase { public: explicit ConcatOp(OpConstructContext *context) : ConcatOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -243,9 +243,11 @@ void RegisterConcat(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + 
return { DeviceType::CPU, DeviceType::GPU }; + } auto tensor_shape_info = context->tensor_shape_info(); - if (op->output_shape_size() != op->output_size() || - op->output_shape(0).dims_size() != 4) { + if (op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } else { int has_data_format = diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 80e8fe78..cc84b963 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -231,9 +231,9 @@ class Conv2dOp : public ConvPool2dOpBase { std::vector paddings(2); if (paddings_.empty()) { CalcPaddingAndOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, filter->shape().data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -242,9 +242,9 @@ class Conv2dOp : public ConvPool2dOpBase { } else { paddings = paddings_; CalcOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, filter->shape().data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -459,7 +459,7 @@ class Conv2dOp : public ConvPool2dOpBase { "leakyrelu_coefficient", 0.0f)), wino_block_size_(Operation::GetOptionalArg("wino_block_size", 0)) { MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 7fb85478..42929057 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -47,8 +47,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -60,8 +60,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { 
.Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -105,8 +105,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { const std::vector output_shape = {1, 3, 3, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -118,8 +118,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -189,8 +189,8 @@ void TestNHWCSimple3x3WithoutBias() { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -203,8 +203,8 @@ void TestNHWCSimple3x3WithoutBias() { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -256,8 +256,8 @@ void TestNHWCCombined3x3() { net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); 
OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -270,8 +270,8 @@ void TestNHWCCombined3x3() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -321,8 +321,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") .Input("Filter") @@ -336,8 +336,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -376,8 +376,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -391,8 +391,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -459,8 +459,8 @@ void TestConv1x1() { net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, 
true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputNCHW") .Input("Filter") @@ -472,8 +472,8 @@ void TestConv1x1() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -532,8 +532,8 @@ void TestComplexConvNxNS12(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true, false); net.AddRandomInput("Bias", {output_channels}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") @@ -552,8 +552,8 @@ void TestComplexConvNxNS12(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -651,8 +651,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, float_bias_data, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -667,8 +667,8 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -811,8 +811,8 @@ void TestDilationConvNxN(const 
std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") @@ -828,8 +828,8 @@ void TestDilationConvNxN(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -900,8 +900,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -916,8 +916,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -979,8 +979,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", {output_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Construct graph OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -994,8 +994,8 @@ void TestArbitraryPadConvNxN(const std::vector &shape, // run on cpu net.RunOp(); - 
net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -1118,12 +1118,12 @@ void TestQuant(const index_t batch, net.AddRandomInput("Filter", {out_channels, k_height, k_width, in_channels}, true); net.AddRandomInput("Bias", {out_channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.TransformFilterDataFormat("Filter", - OHWI, + DataFormat::OHWI, "FilterOIHW", - OIHW); + DataFormat::OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -1136,8 +1136,8 @@ void TestQuant(const index_t batch, .AddIntArg("T", static_cast(DT_FLOAT)) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeFilter") .Input("Filter") diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc index 9cb836ee..20146c8d 100644 --- a/mace/ops/crop.cc +++ b/mace/ops/crop.cc @@ -117,7 +117,7 @@ class CropOp : public Operation { public: explicit CropOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( Operation::GetRepeatedArgs("offset")); } else { @@ -151,11 +151,13 @@ void RegisterCrop(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } int has_data_format = ProtoArgHelper::GetOptionalArg( *op, "has_data_format", 0); if (!has_data_format || - (op->output_shape_size() != op->output_size()) || op->output_shape(0).dims_size() != 4) { 
return { DeviceType::CPU }; } diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index 213b8ce8..0fd0026b 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -42,13 +42,13 @@ void RunCrop(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); } else if (D == CPU) { net.TransformDataFormat("Input0", - NHWC, + DataFormat::NHWC, "InputNCHW0", - NCHW); + DataFormat::NCHW); net.TransformDataFormat("Input1", - NHWC, + DataFormat::NHWC, "InputNCHW1", - NCHW); + DataFormat::NCHW); OpDefBuilder("Crop", "CropTest") .Input("InputNCHW0") .Input("InputNCHW1") @@ -62,8 +62,8 @@ void RunCrop(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check auto expected = net.CreateTensor(expected_shape, expected_data); diff --git a/mace/ops/cumsum_test.cc b/mace/ops/cumsum_test.cc index 8b111540..69e62965 100644 --- a/mace/ops/cumsum_test.cc +++ b/mace/ops/cumsum_test.cc @@ -32,8 +32,8 @@ void SimpleTestWithDataFormat(const std::vector &shape, OpsTestNet net; net.AddInputFromArray("Input", shape, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Cumsum", "CumsumTest") .Input("InputNCHW") @@ -48,8 +48,8 @@ void SimpleTestWithDataFormat(const std::vector &shape, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 3ac54186..2b7623e6 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -173,7 +173,7 @@ class Deconv2dOp : public Deconv2dOpBase { explicit 
Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -240,7 +240,7 @@ class Deconv2dOp : public Deconv2dOpBase { &out_paddings, nullptr, model_type_, - NHWC); + DataFormat::NHWC); return kernel_->Compute(context, input, filter, bias, strides_.data(), in_paddings.data(), activation_, @@ -276,7 +276,7 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) { MACE_NOT_IMPLEMENTED; } FrameworkType framework_type = - static_cast( + static_cast( ProtoArgHelper::GetOptionalArg( *(context->operator_def()), "framework_type", FrameworkType::TENSORFLOW)); diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 25aa7eee..9ea8161e 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -47,7 +47,8 @@ void RunTestSimple(const std::vector &input_shape, net.AddInputFromArray("Filter", filter_shape, filter_data, true); net.AddInputFromArray("Bias", {out_channels}, bias_data, true); // TODO(liutuo): remove the unused transform - net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.TransformFilterDataFormat( + "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW); if (D == DeviceType::GPU) { if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") @@ -77,8 +78,8 @@ void RunTestSimple(const std::vector &input_shape, } net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); if (model_type == FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") @@ -109,8 +110,8 @@ void RunTestSimple(const std::vector &input_shape, // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + 
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(expected_shape, expected_data); @@ -380,8 +381,8 @@ void TestComplexDeconvNxN(const int batch, "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true, false); net.AddRandomInput("Bias", {output_channels}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); int out_h = 0; int out_w = 0; @@ -440,8 +441,8 @@ void TestComplexDeconvNxN(const int batch, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index 09208e7a..a57ddecf 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -96,7 +96,7 @@ class DepthToSpaceOp : public Operation { explicit DepthToSpaceOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 2719619f..65fb7d39 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -32,8 +32,8 @@ void RunDepthToSpace(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); // Construct graph if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -41,8 +41,8 @@ void 
RunDepthToSpace(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("DepthToSpace", "DepthToSpaceTest") @@ -114,8 +114,8 @@ void RandomTest(const int block_size, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) @@ -125,8 +125,8 @@ void RandomTest(const int block_size, // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") .Input("Input") diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 7d389766..ae2a4dfd 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -188,9 +188,9 @@ class DepthwiseConv2dOp filter->dim(2) * filter->dim(3), filter->dim(0), filter->dim(1), 1}; if (paddings_.empty()) { CalcPaddingAndOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, ohwi_shape.data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -199,9 +199,9 @@ class DepthwiseConv2dOp } else { paddings = paddings_; CalcOutputSize(input->shape().data(), - NHWC, + DataFormat::NHWC, ohwi_shape.data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -375,7 +375,7 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { explicit DepthwiseConv2dOp(OpConstructContext *context) : DepthwiseConv2dOpBase(context) { MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == 
MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { @@ -459,6 +459,18 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { context->set_output_mem_type(mem_type); })); #endif // MACE_ENABLE_OPENCL + MACE_REGISTER_OP_CONDITION( + op_registry, + OpConditionBuilder("DepthwiseConv2d") + .SetInputsDataFormatSelector( + [](OpConditionContext *context) -> std::vector { + DataFormat op_data_format = + static_cast( + ProtoArgHelper::GetOptionalArg( + *context->operator_def(), "data_format", + static_cast(DataFormat::NONE))); + return {op_data_format, DataFormat::OIHW, DataFormat::NONE}; + })); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 58852a01..d34722a5 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -39,8 +39,8 @@ void SimpleValidTest() { true); net.AddInputFromArray("Bias", {2}, {.1f, .2f}, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -52,8 +52,8 @@ void SimpleValidTest() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") @@ -127,8 +127,8 @@ void ComplexValidTest(index_t batch, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -141,8 +141,8 @@ void ComplexValidTest(index_t batch, 
.Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") @@ -249,8 +249,8 @@ void TestNxNS12(const index_t height, const index_t width) { {multiplier * channel}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") .Input("Filter") @@ -267,8 +267,8 @@ void TestNxNS12(const index_t height, const index_t width) { // Run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -389,9 +389,9 @@ void TestQuant(const index_t batch, "Filter", {k_height, k_width, in_channels, multiplier}, true, false); net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.TransformFilterDataFormat( - "Filter", HWIO, "FilterOIHW", OIHW); + "Filter", DataFormat::HWIO, "FilterOIHW", DataFormat::OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("InputNCHW") @@ -405,7 +405,7 @@ void TestQuant(const index_t batch, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeFilter") .Input("Filter") diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index 6111ea30..31b634af 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -190,7 +190,7 @@ class 
DepthwiseDeconv2dOp : public Deconv2dOpBase { explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; @@ -230,7 +230,7 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { &out_paddings, nullptr, CAFFE, - NHWC); + DataFormat::NHWC); return kernel_->Compute(context, input, diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index 0cf3de95..fda0cf59 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -39,7 +39,8 @@ void RunTestSimple(const int group, // Add input data net.AddInputFromArray("Input", input_shape, input_data); net.AddInputFromArray("Filter", filter_shape, filter_data, true); - net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.TransformFilterDataFormat( + "Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW); const index_t out_channels = expected_shape[3]; net.AddInputFromArray("Bias", {out_channels}, bias_data, true); @@ -56,8 +57,8 @@ void RunTestSimple(const int group, net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, - "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .Input("InputNCHW") .Input("FilterOIHW") @@ -69,8 +70,8 @@ void RunTestSimple(const int group, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(expected_shape, expected_data); @@ -193,8 +194,8 @@ void RandomTest(index_t batch, {channel * multiplier}, bias_data, true, false); - net.TransformDataFormat("Input", 
NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .Input("InputNCHW") .Input("Filter") @@ -210,8 +211,8 @@ void RandomTest(index_t batch, .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 04c0e10e..bfe00742 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1145,7 +1145,7 @@ class EltwiseOp : public Operation { int32_t scalar_input_index = Operation::GetOptionalArg( "scalar_input_index", 1); MemoryType mem_type; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>( type, coeff, scalar_input, scalar_input_index); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 58306b62..08dc11d0 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -69,7 +69,8 @@ void SimpleTensorScalar(const ops::EltwiseType type, net.AddInputFromArray("Input", shape, input); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "TInput", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput") .AddIntArg("T", DataTypeToEnum::v()) @@ -81,7 +82,8 @@ void SimpleTensorScalar(const ops::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input") @@ -124,13 +126,15 @@ void SimpleTensorEltwise(const ops::EltwiseType type, 
.OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput"); if (shape0.size() > 1) { - net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); op_builder.Input("TInput0"); } else { op_builder.Input("Input0"); } if (shape1.size() > 1) { - net.TransformDataFormat("Input1", NHWC, "TInput1", NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); op_builder.Input("TInput1"); } else { op_builder.Input("Input1"); @@ -139,7 +143,8 @@ void SimpleTensorEltwise(const ops::EltwiseType type, // Run net.RunOp(D); - net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Eltwise", "EltwiseTest") .Input("Input0") @@ -560,7 +565,8 @@ void GPUOverflowTest(const ops::EltwiseType type, net.AddInputFromArray( "Filter", {output_shape.back(), shape0.back(), 3, 3}, - std::vector(output_shape.back() * shape0.back() * 9, 1)); + std::vector(output_shape.back() * shape0.back() * 9, 1), + true); OpDefBuilder("Conv2D", "Conv2D") .AddIntArg("T", DataTypeToEnum::v()) .Input("EltOutput") @@ -636,8 +642,8 @@ void RandomTensorScalar(const ops::EltwiseType type, // Add input data net.AddRandomInput("Input", shape, false, true, true); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput") .AddIntArg("type", static_cast(type)) @@ -647,8 +653,8 @@ void RandomTensorScalar(const ops::EltwiseType type, .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -690,10 
+696,10 @@ void RandomTensorEltwise(const ops::EltwiseType type, true, true); - net.TransformDataFormat("Input0", NHWC, "TInput0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "TInput1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") .Input("TInput1") @@ -705,8 +711,8 @@ void RandomTensorEltwise(const ops::EltwiseType type, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -746,10 +752,10 @@ void Quantized(const std::vector &shape, true, true); - net.TransformDataFormat("Input0", NHWC, "TInput0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "TInput1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW); OpDefBuilder("Eltwise", "EltwiseTest") .Input("TInput0") @@ -761,8 +767,8 @@ void Quantized(const std::vector &shape, // Run net.RunOp(DeviceType::CPU); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput0") .Input("Input0") diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 5be44e05..fb0c45bb 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -49,7 +49,8 @@ void Simple() { net.AddInputFromArray("Offset", {1}, offset, true); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); 
OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") @@ -58,7 +59,8 @@ void Simple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("Input") @@ -100,8 +102,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -113,8 +115,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -151,8 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -164,8 +166,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -205,8 +207,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { net.AddRandomInput("Scale", {channels}, true); net.AddRandomInput("Offset", {channels}, true); - 
net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -218,8 +220,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -254,11 +256,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") @@ -270,8 +272,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index 64765d9c..9a371b16 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -190,7 +190,7 @@ class FullyConnectedOp : public FullyConnectedOpBase { explicit FullyConnectedOp(OpConstructContext *context) : FullyConnectedOpBase(context) { MemoryType mem_type = MemoryType::CPU_BUFFER; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { mem_type = MemoryType::GPU_IMAGE; kernel_ = make_unique>(); } else { diff --git a/mace/ops/fully_connected_test.cc 
b/mace/ops/fully_connected_test.cc index 64fead6e..586eb166 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -48,7 +48,8 @@ void Simple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("Input") @@ -129,8 +130,8 @@ void Random(const index_t batch, net.AddRandomInput("Bias", {out_channel}, true, false); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") .Input("Weight") @@ -143,7 +144,8 @@ void Random(const index_t batch, // run cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor(); @@ -215,8 +217,10 @@ void QuantRandom(const index_t batch, net.AddRandomInput( "Weight", {out_channel, height, width, channels}, true); net.AddRandomInput("Bias", {out_channel}, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformFilterDataFormat("Weight", OHWI, "WeightOIHW", OIHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); + net.TransformFilterDataFormat( + "Weight", DataFormat::OHWI, "WeightOIHW", DataFormat::OIHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") @@ -226,7 +230,8 @@ void QuantRandom(const index_t batch, .AddIntArg("T", DT_FLOAT) .Finalize(net.NewOperatorDef()); net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); 
OpDefBuilder("Quantize", "QuantizeWeight") .Input("Weight") diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index e3597006..9a2d2cdf 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -29,7 +29,8 @@ void Simple() { {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest") .Input("InputNCHW") @@ -41,7 +42,8 @@ void Simple() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index 82ed9053..d43dbf6b 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -36,7 +36,7 @@ class LSTMCellOp : public Operation { Operation::GetOptionalArg("scalar_input", 0.0)); MemoryType mem_type = MemoryType::GPU_IMAGE; - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(forget_bias); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index d5dca3d7..d2ef5058 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -47,7 +47,6 @@ class OpenCLBufferTransformer { const OpenCLBufferType type, const MemoryType out_mem_type, const int wino_blk_size, - DataFormat data_format, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -66,7 +65,6 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform CPU Buffer " << input->name() << " to GPU Buffer " << internal_tensor->name() << " with data type " << dt; - 
MACE_CHECK(data_format == DataFormat::NHWC); internal_tensor->Resize(input->shape()); const uint8_t *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); @@ -88,7 +86,6 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() << " to CPU Buffer " << output->name() << " with data type " << dt; - MACE_CHECK(data_format == DataFormat::NHWC); Tensor::MappingGuard guard(&internal_tensor); const T *internal_ptr = internal_tensor.data(); output->Resize(internal_tensor.shape()); @@ -135,7 +132,7 @@ MaceStatus TransformFilter( input->MarkUnused(); return OpenCLBufferTransformer(input->memory_type(), mem_type). Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, - DataFormat::DF_NONE, output); + output); } } // namespace ops diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index ab61e8c6..aa98275c 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -15,6 +15,7 @@ #include "mace/ops/ops_test_util.h" #include "mace/core/memory_optimizer.h" #include "mace/utils/memory.h" +#include "mace/core/net_def_adapter.h" namespace mace { namespace ops { @@ -164,26 +165,27 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() { bool OpsTestNet::Setup(mace::DeviceType device) { NetDef net_def; for (auto &op_def : op_defs_) { - net_def.add_op()->CopyFrom(op_def); - + auto target_op = net_def.add_op(); + target_op->CopyFrom(op_def); + + auto has_data_format = ProtoArgHelper::GetOptionalArg( + op_def, "has_data_format", 0); + auto is_quantized_op = ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DT_FLOAT)) + == static_cast(DT_UINT8); for (auto input : op_def.input()) { if (ws_.GetTensor(input) != nullptr && !ws_.GetTensor(input)->is_weight()) { auto input_info = net_def.add_input_info(); input_info->set_name(input); - auto has_data_format = ProtoArgHelper::GetOptionalArg( - op_def, "has_data_format", 1); - auto is_quantized_op = ProtoArgHelper::GetOptionalArg( - op_def, 
"T", static_cast(DT_FLOAT)) - == static_cast(DT_UINT8); if (has_data_format) { if (is_quantized_op || device == DeviceType::GPU) { - input_info->set_data_format(NHWC); + input_info->set_data_format(static_cast(DataFormat::NHWC)); } else { - input_info->set_data_format(NCHW); + input_info->set_data_format(static_cast(DataFormat::NCHW)); } } else { - input_info->set_data_format(DataFormat::DF_NONE); + input_info->set_data_format(static_cast(DataFormat::NONE)); } auto &shape = ws_.GetTensor(input)->shape(); for (auto d : shape) { @@ -191,6 +193,10 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } + if (has_data_format) { + SetProtoArg(target_op, "data_format", + static_cast(DataFormat::AUTO)); + } } if (!op_defs_.empty()) { auto op_def = op_defs_.back(); @@ -205,15 +211,21 @@ bool OpsTestNet::Setup(mace::DeviceType device) { } } } + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), &ws_); + net_def_adapter.AdaptNetDef(&net_def, + OpTestContext::Get()->GetDevice(device), + &adapted_net_def); + MemoryOptimizer mem_optimizer; net_ = make_unique( op_registry_.get(), - &net_def, + &adapted_net_def, &ws_, OpTestContext::Get()->GetDevice(device), &mem_optimizer); MaceStatus status = (ws_.PreallocateOutputTensor( - net_def, + adapted_net_def, &mem_optimizer, OpTestContext::Get()->GetDevice(device))); if (status != MaceStatus::MACE_SUCCESS) return false; @@ -252,15 +264,20 @@ MaceStatus OpsTestNet::RunOp() { MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, const mace::DeviceType device) { device_type_ = device; + NetDef adapted_net_def; + NetDefAdapter net_def_adapter(op_registry_.get(), &ws_); + net_def_adapter.AdaptNetDef(&net_def, + OpTestContext::Get()->GetDevice(device), + &adapted_net_def); MemoryOptimizer mem_optimizer; net_ = make_unique( op_registry_.get(), - &net_def, + &adapted_net_def, &ws_, OpTestContext::Get()->GetDevice(device), &mem_optimizer); MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( - net_def, + 
adapted_net_def, &mem_optimizer, OpTestContext::Get()->GetDevice(device))); MACE_RETURN_IF_ERROR(net_->Init()); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index e9ef4d90..8d94f51f 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -216,7 +216,7 @@ class OpsTestNet { const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); - if (src_format == NHWC && dst_format == NCHW) { + if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) { index_t batch = input_shape[0]; index_t height = input_shape[1]; index_t width = input_shape[2]; @@ -236,7 +236,8 @@ class OpsTestNet { } } } - } else if (src_format == NCHW && dst_format == NHWC) { + } else if (src_format == DataFormat::NCHW && + dst_format == DataFormat::NHWC) { index_t batch = input_shape[0]; index_t channels = input_shape[1]; index_t height = input_shape[2]; @@ -274,7 +275,7 @@ class OpsTestNet { input->is_weight()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); - if (src_format == HWOI && dst_format == OIHW) { + if (src_format == DataFormat::HWOI && dst_format == DataFormat::OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t out_channels = input_shape[2]; @@ -292,7 +293,8 @@ class OpsTestNet { input_data[j * out_channels * in_channels + i]; } } - } else if (src_format == OIHW && dst_format == HWOI) { + } else if (src_format == DataFormat::OIHW && + dst_format == DataFormat::HWOI) { index_t out_channels = input_shape[0]; index_t in_channels = input_shape[1]; index_t height = input_shape[2]; @@ -310,7 +312,8 @@ class OpsTestNet { input_data[j * height * width + i]; } } - } else if (src_format == HWIO && dst_format == OIHW) { + } else if (src_format == DataFormat::HWIO && + dst_format == DataFormat::OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t in_channels = input_shape[2]; @@ -330,7 +333,8 @@ 
class OpsTestNet { } } } - } else if (src_format == OHWI && dst_format == OIHW) { + } else if (src_format == DataFormat::OHWI && + dst_format == DataFormat::OIHW) { index_t out_channels = input_shape[0]; index_t height = input_shape[1]; index_t width = input_shape[2]; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index e0a94f4a..24130d7a 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -179,7 +179,7 @@ class PadOp : public Operation { std::vector paddings = Operation::GetRepeatedArgs("paddings"); float constant_value = Operation::GetOptionalArg( "constant_value", 0.0); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( type, paddings, constant_value); } else { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index e68e8eb8..97730559 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -45,8 +45,8 @@ void SimpleConstant() { // Run net.RunOp(D); } else { - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -58,8 +58,8 @@ void SimpleConstant() { // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto output = net.GetTensor("Output"); @@ -93,7 +93,8 @@ void Result(const std::vector &input_shape, if (D == DeviceType::CPU) { t_input = "TInput"; t_output = "TOutput"; - net.TransformDataFormat(input, NHWC, t_input, NCHW); + net.TransformDataFormat( + input, DataFormat::NHWC, t_input, DataFormat::NCHW); } OpDefBuilder("Pad", "PadTest") @@ -108,7 +109,8 @@ void Result(const std::vector &input_shape, net.RunOp(D); if (D == DeviceType::CPU) { - net.TransformDataFormat(t_output, NCHW, output, NHWC); + net.TransformDataFormat( + t_output, DataFormat::NCHW, output, DataFormat::NHWC); } 
auto actual = net.GetTensor(output.c_str()); @@ -172,8 +174,8 @@ TEST_F(PadTest, ComplexCPU) { // Add input data net.AddRepeatedInput("Input", {1, 1, 1, 2}, 2); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -184,8 +186,8 @@ TEST_F(PadTest, ComplexCPU) { // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto output = net.GetTensor("Output"); @@ -209,8 +211,8 @@ void Complex(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "TInput", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "TInput", DataFormat::NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") @@ -222,8 +224,8 @@ void Complex(const std::vector &input_shape, // Run net.RunOp(); - net.TransformDataFormat("TOutput", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 21d02e14..ce726dcb 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -270,9 +270,9 @@ class PoolingOp : public PoolingOpBase { std::vector paddings(2); if (paddings_.empty()) { CalcPaddingAndOutputSize(input_tensor->shape().data(), - NHWC, + DataFormat::NHWC, filter_shape.data(), - OHWI, + DataFormat::OHWI, dilations_.data(), strides_.data(), padding_type_, @@ -281,9 +281,9 @@ class PoolingOp : public PoolingOpBase { } else { paddings = paddings_; CalcOutputSize(input_tensor->shape().data(), - NHWC, + DataFormat::NHWC, filter_shape.data(), - OHWI, + DataFormat::OHWI, paddings_.data(), dilations_.data(), strides_.data(), @@ -477,7 +477,7 
@@ class PoolingOp : public PoolingOpBase { public: explicit PoolingOp(OpConstructContext *context) : PoolingOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { kernel_ = make_unique>(); diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 104b67bc..037cf8cf 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -34,8 +34,8 @@ TEST_F(PoolingOpTest, MAX_VALID) { {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -50,8 +50,8 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = @@ -68,8 +68,8 @@ TEST_F(PoolingOpTest, MAX_SAME) { net.AddInputFromArray("Input", {1, 3, 3, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -84,8 +84,8 @@ TEST_F(PoolingOpTest, MAX_SAME) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); @@ -102,8 +102,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", 
DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -118,8 +118,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); @@ -136,8 +136,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -152,8 +152,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); @@ -174,8 +174,8 @@ void SimpleMaxPooling3S2() { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); // Run OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -187,8 +187,8 @@ void SimpleMaxPooling3S2() { .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else if (D == DeviceType::GPU) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") @@ -224,8 +224,8 @@ void MaxPooling3S2(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - 
NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -240,8 +240,8 @@ void MaxPooling3S2(const std::vector &input_shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -304,8 +304,8 @@ TEST_F(PoolingOpTest, AVG_VALID) { {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -320,8 +320,8 @@ TEST_F(PoolingOpTest, AVG_VALID) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor( @@ -373,8 +373,8 @@ void AvgPoolingTest(const std::vector &shape, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") @@ -389,8 +389,8 @@ void AvgPoolingTest(const std::vector &shape, // run on cpu net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -563,7 +563,7 @@ void TestQuant(const index_t batch, net.AddRandomInput( "Input", input_shape, false, false); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", 
DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddRandomInput( "OutputNCHW", input_shape, false, true, true); @@ -580,7 +580,7 @@ void TestQuant(const index_t batch, net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index 86964ed9..27b34a91 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -873,7 +873,7 @@ class ReduceOp : public ReduceOpBase { public: explicit ReduceOp(OpConstructContext *context) : ReduceOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(reduce_type_, axis_, keep_dims_); @@ -914,6 +914,9 @@ void RegisterReduce(OpRegistryBase *op_registry) { .SetDevicePlacerFunc( [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); + if (op->output_shape_size() != op->output_size()) { + return { DeviceType::CPU, DeviceType::GPU }; + } bool keep_dims = ProtoArgHelper::GetOptionalArg( *op, "keepdims", false); @@ -923,7 +926,7 @@ void RegisterReduce(OpRegistryBase *op_registry) { auto axis = ProtoArgHelper::GetRepeatedArgs( *op, "axis"); - if (axis.size() != 2 || axis[0] != 1 || axis[1] == 2) { + if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) { return { DeviceType::CPU }; } auto tensor_shape_info = context->tensor_shape_info(); diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc index ccf38fea..21a2dc13 100644 --- a/mace/ops/reduce_test.cc +++ b/mace/ops/reduce_test.cc @@ -38,7 +38,8 @@ void Simple(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input); if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Reduce", "ReduceTest") 
.Input("InputNCHW") .AddIntsArg("axis", axis) @@ -49,7 +50,8 @@ void Simple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("Reduce", "ReduceTest") .Input("Input") @@ -289,8 +291,8 @@ void RandomTest(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Reduce", "ReduceTest") .Input("InputNCHW") .AddIntsArg("axis", axis) @@ -301,8 +303,8 @@ void RandomTest(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Reduce", "ReduceTest") .Input("Input") .AddIntsArg("axis", axis) @@ -353,7 +355,7 @@ void TestQuant(const std::vector &input_shape, net.AddRandomInput( "Input", input_shape, false, false); net.TransformDataFormat( - "Input", NHWC, "InputNCHW", NCHW); + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddRandomInput( "OutputNCHW", input_shape, false, true, true); @@ -368,7 +370,7 @@ void TestQuant(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); net.RunOp(CPU); net.TransformDataFormat( - "OutputNCHW", NCHW, "Output", NHWC); + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("Quantize", "QuantizeInput") .Input("Input") diff --git a/mace/ops/ref/deconv_2d.cc b/mace/ops/ref/deconv_2d.cc index 6044af3b..d06c6634 100644 --- a/mace/ops/ref/deconv_2d.cc +++ b/mace/ops/ref/deconv_2d.cc @@ -51,7 +51,7 @@ MaceStatus Deconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - 
NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/ref/depthwise_deconv_2d.cc b/mace/ops/ref/depthwise_deconv_2d.cc index 0da81faa..63b3aa69 100644 --- a/mace/ops/ref/depthwise_deconv_2d.cc +++ b/mace/ops/ref/depthwise_deconv_2d.cc @@ -50,7 +50,7 @@ MaceStatus DepthwiseDeconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); @@ -185,7 +185,7 @@ MaceStatus GroupDeconv2d::Compute(const OpContext *context, &out_pad_size, &padded_out_shape, framework_type_, - NCHW); + DataFormat::NCHW); MACE_RETURN_IF_ERROR(output->Resize(out_shape)); diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index f06692b9..349f6423 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -212,7 +212,7 @@ class ResizeBicubicOp : public Operation { std::vector size = Operation::GetRepeatedArgs( "size", {-1, -1}); MACE_CHECK(size.size() == 2); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners, size[0], size[1]); } else { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 035ddfcf..e9c5e4d1 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -31,8 +31,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -42,8 +42,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + 
net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -60,8 +60,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { std::vector input(48); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 4, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -71,8 +71,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 2, 3, 3}, @@ -92,8 +92,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") .Input("InputNCHW") @@ -104,8 +104,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -133,8 +133,8 @@ void TestRandomResizeBicubic() { net.AddRandomInput("Input", {batch, in_height, in_width, channels}, false, true, true); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") 
.Input("InputNCHW") @@ -144,8 +144,8 @@ void TestRandomResizeBicubic() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); Tensor expected; expected.Copy(*net.GetOutput("Output")); diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 1fe13f42..09df62d8 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -346,7 +346,7 @@ class ResizeBilinearOp : public Operation { std::vector size = Operation::GetRepeatedArgs( "size", {-1, -1}); MACE_CHECK(size.size() == 2); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners, size[0], size[1]); } else { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 9252e81f..c9c86427 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -31,8 +31,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -42,8 +42,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -60,8 +60,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { std::vector input(24); std::iota(begin(input), end(input), 0); net.AddInputFromArray("Input", {1, 2, 
4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -72,8 +72,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -100,8 +100,8 @@ void TestRandomResizeBilinear() { // Add input data net.AddRandomInput("Input", {batch, in_height, in_width, channels}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -111,8 +111,8 @@ void TestRandomResizeBilinear() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); @@ -155,8 +155,8 @@ void TestQuantizedResizeBilinear() { true, -1.f, 1.f); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputNCHW") @@ -166,8 +166,8 @@ void TestQuantizedResizeBilinear() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") diff --git a/mace/ops/resize_nearest_neighbor.cc 
b/mace/ops/resize_nearest_neighbor.cc index 8840458f..9e98e75e 100644 --- a/mace/ops/resize_nearest_neighbor.cc +++ b/mace/ops/resize_nearest_neighbor.cc @@ -149,7 +149,7 @@ class ResizeNearestNeighborOp : public Operation { : Operation(context) { bool align_corners = Operation::GetOptionalArg( "align_corners", false); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>( align_corners); } else { diff --git a/mace/ops/resize_nearest_neighbor_test.cc b/mace/ops/resize_nearest_neighbor_test.cc index b9500472..842c44c6 100644 --- a/mace/ops/resize_nearest_neighbor_test.cc +++ b/mace/ops/resize_nearest_neighbor_test.cc @@ -32,8 +32,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) { std::iota(begin(input), end(input), 0); std::vector size = {1, 2}; net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") @@ -45,8 +45,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8}); @@ -64,8 +64,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) { std::iota(begin(input), end(input), 0); std::vector size = {1, 2}; net.AddInputFromArray("Input", {1, 2, 4, 3}, input); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", 
"ResizeNearestNeighborTest") @@ -78,8 +78,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) { // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); // Check auto expected = net.CreateTensor({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11}); @@ -105,8 +105,8 @@ void TestRandomResizeNearestNeighbor() { std::vector size = {20, 40}; net.AddRandomInput("Input", {batch, in_height, in_width, channels}); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); net.AddInputFromArray("Size", {2}, size); OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest") .Input("InputNCHW") @@ -116,8 +116,8 @@ void TestRandomResizeNearestNeighbor() { .Finalize(net.NewOperatorDef()); // Run on CPU net.RunOp(DeviceType::CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index d5fcbc02..e3241098 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -414,7 +414,7 @@ class SoftmaxOp : public Operation { : Operation(context) { bool use_log = ( Operation::GetOptionalArg("use_log", false)); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(use_log); } else { kernel_ = make_unique>(use_log); diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index ab818ac8..eb3398db 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -50,7 +50,8 @@ void Simple(bool use_log = false) { if (D == DeviceType::CPU) { // test 4d softmax - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + 
net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -59,7 +60,8 @@ void Simple(bool use_log = false) { // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); @@ -109,7 +111,8 @@ void Complex(const std::vector &logits_shape, net.AddRandomInput("Input", logits_shape); if (logits_shape.size() == 4) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputNCHW") @@ -127,7 +130,8 @@ void Complex(const std::vector &logits_shape, net.RunOp(); if (logits_shape.size() == 4) { - net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } auto expected = net.CreateTensor(); diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc index b239193c..50de3fc7 100644 --- a/mace/ops/space_to_batch.cc +++ b/mace/ops/space_to_batch.cc @@ -307,7 +307,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase { public: explicit SpaceToBatchNDOp(OpConstructContext *context) : SpaceToBatchOpBase(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 95b9fafc..045d6ece 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -39,8 +39,8 @@ void RunSpaceToBatch(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { - 
net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -53,8 +53,8 @@ void RunSpaceToBatch(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check ExpectTensorNear(*expected, *net.GetOutput("Output")); @@ -78,8 +78,8 @@ void RunBatchToSpace(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); } else if (D == CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -92,8 +92,8 @@ void RunBatchToSpace(const std::vector &input_shape, net.RunOp(D); if (D == CPU) { - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } // Check ExpectTensorNear(*expected, *net.GetOutput("Output")); @@ -155,8 +155,8 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, net.RunOp(GPU); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -164,8 +164,8 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", 
DataFormat::NHWC); // Check ExpectTensorNear(*net.GetOutput("OutputCPU"), @@ -188,8 +188,8 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, net.RunOp(GPU); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -197,8 +197,8 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // Check ExpectTensorNear(*net.GetOutput("OutputCPU"), @@ -218,8 +218,8 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, 1.f); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -227,8 +227,8 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") @@ -279,8 +279,8 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, 1.f); // run cpu - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -288,8 +288,8 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, 
.AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(CPU); - net.TransformDataFormat("OutputNCHW", NCHW, - "OutputCPU", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC); // run quantize OpDefBuilder("Quantize", "QuantizeInput") diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc index 918ae678..9584ddb8 100644 --- a/mace/ops/space_to_depth.cc +++ b/mace/ops/space_to_depth.cc @@ -94,7 +94,7 @@ class SpaceToDepthOp : public Operation { explicit SpaceToDepthOp(OpConstructContext *context) : Operation(context) { int block_size = Operation::GetOptionalArg("block_size", 1); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(block_size); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index 23daaa55..6d023b88 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -32,8 +32,8 @@ void RunSpaceToDepth(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); // Construct graph if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("InputNCHW") .Output("OutputNCHW") @@ -41,8 +41,8 @@ void RunSpaceToDepth(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); } else { OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") @@ -107,8 +107,8 @@ void RandomTest(const int block_size, // Add input data net.AddRandomInput("Input", shape); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + 
net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("InputNCHW") .AddIntArg("block_size", block_size) @@ -118,8 +118,8 @@ void RandomTest(const int block_size, // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") .Input("Input") diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 6b646270..b08d72c5 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -106,7 +106,7 @@ class SplitOp : public Operation { explicit SplitOp(OpConstructContext *context) : Operation(context) { int32_t axis = Operation::GetOptionalArg("axis", 3); - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(axis); } else { MACE_NOT_IMPLEMENTED; @@ -147,7 +147,7 @@ void RegisterSplit(OpRegistryBase *op_registry) { [](OpConditionContext *context) -> std::set { auto op = context->operator_def(); if (op->output_shape_size() != op->output_size()) { - return { DeviceType::CPU }; + return {DeviceType::CPU, DeviceType::GPU}; } int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc index d58191c4..cd2fb174 100644 --- a/mace/ops/sqrdiff_mean.cc +++ b/mace/ops/sqrdiff_mean.cc @@ -83,7 +83,7 @@ class SqrDiffMeanOp : public Operation { public: explicit SqrDiffMeanOp(OpConstructContext *context) : Operation(context) { - if (context->device()->gpu_runtime()->UseImageMemory()) { + if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) { kernel_ = make_unique>(); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/sqrdiff_mean_test.cc b/mace/ops/sqrdiff_mean_test.cc index 34257479..3257987c 100644 --- a/mace/ops/sqrdiff_mean_test.cc +++ b/mace/ops/sqrdiff_mean_test.cc @@ -36,13 +36,13 @@ 
void Simple(const std::vector &input_shape0, net.AddInputFromArray("Input1", input_shape1, input1); net.TransformDataFormat("Input0", - NHWC, + DataFormat::NHWC, "InputNCHW0", - NCHW); + DataFormat::NCHW); net.TransformDataFormat("Input1", - NHWC, + DataFormat::NHWC, "InputNCHW1", - NCHW); + DataFormat::NCHW); if (D == DeviceType::CPU) { OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") @@ -54,9 +54,9 @@ void Simple(const std::vector &input_shape0, net.RunOp(D); net.TransformDataFormat("OutputNCHW", - NCHW, + DataFormat::NCHW, "Output", - NHWC); + DataFormat::NHWC); } else { OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("Input0") @@ -107,10 +107,10 @@ void RandomTest(const std::vector &input_shape0, net.AddRandomInput("Input0", input_shape0); net.AddRandomInput("Input1", input_shape1); - net.TransformDataFormat("Input0", NHWC, "InputNCHW0", - NCHW); - net.TransformDataFormat("Input1", NHWC, "InputNCHW1", - NCHW); + net.TransformDataFormat( + "Input0", DataFormat::NHWC, "InputNCHW0", DataFormat::NCHW); + net.TransformDataFormat( + "Input1", DataFormat::NHWC, "InputNCHW1", DataFormat::NCHW); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("InputNCHW0") .Input("InputNCHW1") @@ -118,8 +118,8 @@ void RandomTest(const std::vector &input_shape0, .Finalize(net.NewOperatorDef()); // Run net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, - "Output", NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") .Input("Input0") .Input("Input1") diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index 8b085fe5..f8dd06f5 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -86,8 +86,8 @@ void TestStridedSliceWithDataFormat(const std::vector &input_shape, net.AddInputFromArray( "Strides", {static_cast(strides.size())}, strides); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + 
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("StridedSlice", "StridedSliceOpTest") .Input("InputNCHW") @@ -105,8 +105,8 @@ void TestStridedSliceWithDataFormat(const std::vector &input_shape, net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", output_shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), *net.GetOutput("Output")); @@ -154,8 +154,8 @@ void TestSliceWithDataFormat(const std::vector &input_shape, net.AddInputFromArray( "IndicesSize", {static_cast(indices_size.size())}, indices_size); - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); + net.TransformDataFormat( + "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW); OpDefBuilder("StridedSlice", "StridedSliceOpTest") .Input("InputNCHW") @@ -168,8 +168,8 @@ void TestSliceWithDataFormat(const std::vector &input_shape, net.RunOp(); - net.TransformDataFormat("OutputNCHW", NCHW, "Output", - NHWC); + net.TransformDataFormat( + "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC); net.AddInputFromArray("ExpectedOutput", output_shape, output); ExpectTensorNear(*net.GetOutput("ExpectedOutput"), *net.GetOutput("Output")); diff --git a/mace/public/mace.h b/mace/public/mace.h index dd559249..72e96d1e 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -34,10 +34,10 @@ class NetDef; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 }; -enum DataFormat { - DF_NONE = 0, NHWC = 1, NCHW = 2, +enum class DataFormat { + NONE = 0, NHWC = 1, NCHW = 2, HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103, - DF_AUTO = 1000, + AUTO = 1000, }; enum GPUPerfHint { diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 446321a4..58658dd8 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -41,7 +41,7 @@ device_type_map = {'cpu': 
cvt.DeviceType.CPU.value, 'cpu+gpu': cvt.DeviceType.CPU.value} data_format_map = { - 'NONE': cvt.DataFormat.DF_NONE, + 'NONE': cvt.DataFormat.NONE, 'NHWC': cvt.DataFormat.NHWC, 'NCHW': cvt.DataFormat.NCHW, 'OIHW': cvt.DataFormat.OIHW, diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 8162f008..61e65bae 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -26,14 +26,14 @@ class DeviceType(Enum): class DataFormat(Enum): - DF_NONE = 0 + NONE = 0 NHWC = 1 NCHW = 2 HWIO = 100 OIHW = 101 HWOI = 102 OHWI = 103 - DF_AUTO = 1000 + AUTO = 1000 # SAME_LOWER: if the amount of paddings to be added is odd, @@ -598,8 +598,8 @@ class ConverterUtil(object): return DataFormat.NHWC elif arg.i == DataFormat.NCHW.value: return DataFormat.NCHW - elif arg.i == DataFormat.DF_AUTO.value: - return DataFormat.DF_AUTO + elif arg.i == DataFormat.AUTO.value: + return DataFormat.AUTO else: return None diff --git a/mace/python/tools/converter_tool/onnx_converter.py b/mace/python/tools/converter_tool/onnx_converter.py index 8974489c..70e855d5 100644 --- a/mace/python/tools/converter_tool/onnx_converter.py +++ b/mace/python/tools/converter_tool/onnx_converter.py @@ -387,7 +387,8 @@ class OnnxConverter(base_converter.ConverterInterface): self._mace_net_def = mace_pb2.NetDef() self._data_format = DataFormat.NCHW ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) - ConverterUtil.add_data_format_arg(self._mace_net_def, self._data_format) + ConverterUtil.add_data_format_arg(self._mace_net_def, + self._data_format) onnx_model = onnx.load(src_model_file) ir_version = onnx_model.ir_version @@ -403,7 +404,7 @@ class OnnxConverter(base_converter.ConverterInterface): print("constains ops domain: ", domain, "version:", version) if 'kaldi2onnx' in domain: polish_available = False - self._data_format = DataFormat.DF_NONE + self._data_format = 
DataFormat.NONE self._isKaldi = True if polish_available: onnx_model = onnx.utils.polish_model(onnx_model) diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 65c456c9..51806961 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -27,7 +27,7 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp -from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps +from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps # noqa from mace.python.tools.converter_tool.base_converter import MaceMayHasDataFormatOps # noqa from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import ReduceType @@ -200,15 +200,15 @@ class Transformer(base_converter.ConverterInterface): op.output.extend([input_node.name]) output_shape = op.output_shape.add() output_shape.dims.extend(input_node.shape) - if input_node.data_format != DataFormat.DF_NONE: + if input_node.data_format != DataFormat.NONE: if input_node.data_format == DataFormat.NCHW: self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) ConverterUtil.add_data_format_arg(op, - DataFormat.DF_AUTO) + DataFormat.AUTO) else: ConverterUtil.add_data_format_arg(op, - DataFormat.DF_NONE) + DataFormat.NONE) self._producer[op.output[0]] = op @staticmethod @@ -261,7 +261,7 @@ class Transformer(base_converter.ConverterInterface): producer = self._producer[tensor] return ConverterUtil.data_format(producer) else: - return DataFormat.DF_NONE + return DataFormat.NONE def consumer_count(self, tensor_name): return len(self._consumers.get(tensor_name, [])) @@ -1021,7 +1021,6 @@ class 
Transformer(base_converter.ConverterInterface): filter_format.name) return False - def add_winograd_arg(self): if self._wino_arg == 0: return False @@ -1350,20 +1349,21 @@ class Transformer(base_converter.ConverterInterface): df_arg = op.arg.add() df_arg.name = MaceKeyword.mace_data_format_str if op.type in MaceHasDataFormatOps: - df_arg.i = DataFormat.DF_AUTO.value + df_arg.i = DataFormat.AUTO.value elif op.type in MaceMayHasDataFormatOps: - input_df = DataFormat.DF_AUTO.value + input_df = DataFormat.AUTO.value for input_tensor in op.input: if input_tensor in self._consts: continue - mace_check(input_tensor in self._producer, - "Input tensor %s not in producer" % input_tensor) + mace_check( + input_tensor in self._producer, + "Input tensor %s not in producer" % input_tensor) father_op = self._producer[input_tensor] temp_input_df = ConverterUtil.get_arg( father_op, MaceKeyword.mace_data_format_str) - if temp_input_df.i != DataFormat.DF_AUTO.value: + if temp_input_df.i != DataFormat.AUTO.value: input_df = temp_input_df.i - if input_df == DataFormat.DF_AUTO.value: + if input_df == DataFormat.AUTO.value: df_arg.i = input_df # add flag to mark the ops may has data format has_data_format_arg = op.arg.add() @@ -1379,7 +1379,7 @@ class Transformer(base_converter.ConverterInterface): src_data_format = ConverterUtil.data_format(net) for op in net.op: has_data_format = ConverterUtil.data_format(op) == \ - DataFormat.DF_AUTO + DataFormat.AUTO # transpose args if op.type == MaceOp.Pad.name: for arg in op.arg: diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 89bee8d8..0d1396c4 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -80,7 +80,7 @@ void CreateInputInfo(NetDef *net_def) { input_info = net_def->add_input_info(); input_info->set_name({{ net.input_info[idx].name|tojson }}); input_info->set_data_type(static_cast({{ net.input_info[idx].data_type }})); - input_info->set_data_format(static_cast({{ 
net.input_info[idx].data_format }})); + input_info->set_data_format({{ net.input_info[idx].data_format }}); input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }}); {% for dim in net.input_info[idx].dims %} input_info->add_dims({{ dim }}); @@ -97,7 +97,7 @@ void CreateOutputInfo(NetDef *net_def) { output_info = net_def->add_output_info(); output_info->set_name({{ net.output_info[idx].name|tojson }}); output_info->set_data_type(static_cast({{ net.output_info[idx].data_type }})); - output_info->set_data_format(static_cast({{ net.output_info[idx].data_format }})); + output_info->set_data_format({{ net.output_info[idx].data_format }}); output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }}); {% for dim in net.output_info[idx].dims %} output_info->add_dims({{dim}}); diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 4bf5f40b..a06ce493 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -48,7 +48,7 @@ void MaceRunFunc(const int in_out_size) { for (size_t i = 0; i < input_names.size(); ++i) { InputOutputInfo *info = net_def->add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names[i]); for (auto d : input_shapes[0]) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 0a852a17..6cad55b9 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -45,7 +45,7 @@ void MaceRun(const int in_out_size, for (size_t i = 0; i < input_names.size(); ++i) { InputOutputInfo *info = net_def->add_input_info(); - info->set_data_format(DataFormat::NHWC); + info->set_data_format(static_cast(DataFormat::NHWC)); info->set_name(input_names[i]); for (auto d : max_shape) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h index 9cc1402f..faaf1443 100644 --- a/mace/test/mace_api_test.h +++ 
b/mace/test/mace_api_test.h @@ -76,7 +76,7 @@ void Conv3x3(const std::string &input_name, .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("has_data_format", 1) + .AddIntArg("data_format", static_cast(DataFormat::AUTO)) .Finalize(&operator_def); OutputShape *shape = operator_def.add_output_shape(); @@ -99,7 +99,7 @@ void Relu(const std::string &input_name, .AddStringArg("activation", "RELU") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddIntArg("device", static_cast(device_type)) - .AddIntArg("has_data_format", 1) + .AddIntArg("data_format", static_cast(DataFormat::AUTO)) .Finalize(&operator_def); net_def->add_op()->CopyFrom(operator_def); @@ -139,7 +139,8 @@ void CheckOutputs(const NetDef &net_def, if (D == DeviceType::CPU) { std::string input_name = input.first + "NHWC"; net.AddInputFromArray(input_name, input_shape, input_data); - net.TransformDataFormat(input_name, NHWC, input.first, NCHW); + net.TransformDataFormat( + input_name, DataFormat::NHWC, input.first, DataFormat::NCHW); } else { net.AddInputFromArray(input.first, input_shape, input_data); } @@ -154,7 +155,7 @@ void CheckOutputs(const NetDef &net_def, memcpy(data.data(), reinterpret_cast(tensor_data.data()) + tensor.offset(), tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); + net.AddInputFromArray(tensor.name(), shape, data, true); } net.RunNet(net_def, D); @@ -175,9 +176,9 @@ void CheckOutputs(const NetDef &net_def, if (D == DeviceType::CPU) { output_name = output.first + "NHWC"; net.TransformDataFormat(output.first, - NCHW, + DataFormat::NCHW, output_name, - NHWC); + DataFormat::NHWC); } ops::test::ExpectTensorNear(*tmp_tensor, *net.GetOutput(output_name.data()), diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 7fc0690d..fca4a0fd 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -91,7 +91,7 
@@ DataFormat ParseDataFormat(const std::string &data_format_str) { } else if (data_format_str == "OIHW") { return DataFormat::OIHW; } else { - return DataFormat::DF_NONE; + return DataFormat::NONE; } } -- GitLab