diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 4bd44ada514baf095cdb4bdfb6520808a285172c..666840e9fe31877d04b7197871775a7b3ed0f3c6 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -90,6 +90,16 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else { + return DataFormat::DF_NONE; + } +} + bool RunInference(MaceEngine *engine, const std::map &input_infos, std::map *output_infos, @@ -168,6 +178,12 @@ DEFINE_string(output_node, "output_node0,output_node1", "output nodes, separated by comma"); DEFINE_string(input_shape, "", "input shape, separated by colon and comma"); DEFINE_string(output_shape, "", "output shape, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name"); DEFINE_int32(max_num_runs, 100, "max number of runs"); DEFINE_double(max_seconds, 10.0, "max number of seconds to run"); @@ -233,6 +249,19 @@ int Main(int argc, char **argv) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + str_util::Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + str_util::Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + mace::DeviceType device_type = ParseDeviceType(FLAGS_device); // configuration @@ -333,7 +362,8 @@ int Main(int argc, char **argv) { LOG(INFO) << "Open input file failed"; return -1; } - inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -344,7 +374,8 @@ int Main(int argc, char **argv) { auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], - buffer_out); + buffer_out, + output_data_formats[i]); } int64_t warmup_time_us = 0; diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 39020384a003d9bee4f9700869c9068c3e9e7d15..0b448ac97c508cbc4786823d8318750db3e683d0 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -115,6 +115,8 @@ void MemoryOptimizer::Optimize( op_def->output_type_size()); DataType dt; + bool has_data_format = ProtoArgHelper::GetOptionalArg( + *op_def, "has_data_format", 0) != 0; int output_size = op_def->output_size(); for (int i = 0; i < output_size; ++i) { if (i < op_def->output_type_size()) { @@ -134,7 +136,7 @@ void MemoryOptimizer::Optimize( MemoryBlock best_mem_block; if (IsMemoryReuseOp(op_def->type())) { if (tensor_mem_map_.count(op_def->input(0)) == 1) { - best_mem_id = tensor_mem_map_[op_def->input(0)].first; + best_mem_id = tensor_mem_map_.at(op_def->input(0)).mem_id; } } else { auto shape = std::vector( @@ -204,7 +206,8 @@ void MemoryOptimizer::Optimize( } else { mem_ref_count_[best_mem_id] = 1; } - tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt); + tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id, + dt, has_data_format)); } } @@ -216,7 +219,7 @@ void MemoryOptimizer::Optimize( tensor_ref_count_[input_name] -= 1; if (tensor_ref_count_.at(input_name) == 0 && tensor_mem_map_.count(input_name) == 1) { - int mem_id = tensor_mem_map_.at(input_name).first; + int mem_id = tensor_mem_map_.at(input_name).mem_id; mem_ref_count_[mem_id] -= 1; if (mem_ref_count_.at(mem_id) == 0) { idle_blocks_.insert(mem_id); @@ -236,7 +239,7 @@ const std::vector& MemoryOptimizer::mem_blocks() const { return mem_blocks_; } -const std::unordered_map>& +const std::unordered_map& MemoryOptimizer::tensor_mem_map() const { return tensor_mem_map_; } diff --git a/mace/core/memory_optimizer.h b/mace/core/memory_optimizer.h index 555613e6a2043a47289bab0d8a44c282097bafc8..2b05aa01e4a7944dc3d53c92d34735211203746e 100644 --- a/mace/core/memory_optimizer.h +++ b/mace/core/memory_optimizer.h @@ -77,6 +77,17 @@ class MemoryBlock { }; class MemoryOptimizer { + public: + struct TensorMemInfo { + int mem_id; + DataType data_type; + bool has_data_format; + + TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) : + mem_id(mem_id), data_type(data_type), has_data_format(has_data_format) + {} + }; + public: static bool IsMemoryReuseOp(const std::string &op_type); void UpdateTensorRef(const std::string &tensor_name); @@ -86,8 +97,7 @@ class MemoryOptimizer { const std::vector &mem_blocks() const; - const std::unordered_map> &tensor_mem_map() const; + const std::unordered_map &tensor_mem_map() const; std::string DebugInfo() const; @@ -101,7 +111,7 @@ class MemoryOptimizer { std::vector mem_blocks_; // tensor name : // Buffer Memory do not different data type, so store the data type. - std::unordered_map> tensor_mem_map_; + std::unordered_map tensor_mem_map_; std::unordered_map mem_ref_count_; std::set idle_blocks_; }; diff --git a/mace/core/net.cc b/mace/core/net.cc index 5ff777b0607715ac5caa9a3beb40c17841b00d3a..4bcbbb70e5378ff1dd76620e1dedcc61831dd48d 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -70,7 +70,7 @@ std::unique_ptr SerialNet::CreateOperation( const OpRegistryBase *op_registry, OpConstructContext *construct_context, std::shared_ptr op_def, - DataFormat data_format_flag, + bool has_data_format, bool is_quantize_model) { // Create the Operation DeviceType target_device_type = target_device_->device_type(); @@ -100,8 +100,7 @@ std::unique_ptr SerialNet::CreateOperation( if (!is_quantize_model && device_type == DeviceType::CPU && op_def->output_shape_size() == op_def->output_size()) { for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { - if (data_format_flag == NHWC && - op_def->output_shape(out_idx).dims_size() == 4) { + if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) { // NHWC -> NCHW std::vector output_shape = TransposeShape( @@ -160,7 +159,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, tensor.dims().end())); } - DataFormat data_format_flag = NHWC; + bool has_data_format = false; if (target_device_->device_type() == DeviceType::CPU) { target_mem_type = MemoryType::CPU_BUFFER; for (auto &input_info : net_def->input_info()) { @@ -170,15 +169,15 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, // update tensor shape map tensor_shape_map[input_info.name()] = input_shape; // Only could be NONE or NHWC - auto input_data_format = static_cast( + DataFormat input_data_format = static_cast( input_info.data_format()); - if (!is_quantize_model && input_data_format == NHWC && + has_data_format = has_data_format || + (input_data_format != DataFormat::DF_NONE); + if (!is_quantize_model && input_data_format == DataFormat::NHWC && input_info.dims_size() == 4) { // NHWC -> NCHW input_shape = TransposeShape(input_shape, {0, 3, 1, 2}); - } else if (input_data_format == DataFormat::DF_NONE) { - data_format_flag = DataFormat::DF_NONE; } output_map.emplace(input_info.name(), InternalOutputInfo( target_mem_type, DataType::DT_FLOAT, input_shape, -1)); @@ -189,11 +188,8 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, else { // GPU NOLINT[readability/braces] target_mem_type = MemoryType::GPU_BUFFER; for (auto &input_info : net_def->input_info()) { - auto input_data_format = static_cast( - input_info.data_format()); - if (input_data_format == DataFormat::DF_NONE) { - data_format_flag = DataFormat::DF_NONE; - } + has_data_format = static_cast( + input_info.data_format()) == NHWC; std::vector input_shape = std::vector(input_info.dims().begin(), input_info.dims().end()); @@ -212,7 +208,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, auto op = CreateOperation(op_registry, &construct_context, op_def, - data_format_flag, + has_data_format, is_quantize_model); #ifdef MACE_ENABLE_OPENCL // Add input transform operation if necessary @@ -259,13 +255,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, } auto transform_op_def = OpenCLUtil::CreateTransformOpDef( input_name, input_shape, t_input_name, - wanted_in_dt, wanted_in_mem_type, data_format_flag); + wanted_in_dt, wanted_in_mem_type, has_data_format); OpConstructContext t_construct_context(ws_); auto transform_op = CreateOperation( op_registry, &t_construct_context, transform_op_def, - data_format_flag); + has_data_format); operators_.emplace_back(std::move(transform_op)); transformed_set.insert(t_input_name); output_mem_map[t_input_name] = wanted_in_mem_type; @@ -340,7 +336,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, output_mem_map[output_info.name()] = target_mem_type; } } - auto output_data_format = + bool output_has_data_format = static_cast(output_info.data_format()); auto transform_op_def = OpenCLUtil::CreateTransformOpDef( t_output_name, @@ -348,12 +344,12 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, output_info.name(), output_info.data_type(), target_mem_type, - data_format_flag); + output_has_data_format); auto transform_op = CreateOperation( op_registry, &construct_context, transform_op_def, - output_data_format); + output_has_data_format); operators_.emplace_back(std::move(transform_op)); // where to do graph reference count. mem_optimizer->UpdateTensorRef(transform_op_def.get()); diff --git a/mace/core/net.h b/mace/core/net.h index 9945d04637d5eafa402297462b3e9adf1375abdd..cafd1a7fe46e47c64a79bd6e8b4208114062d889 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -59,7 +59,7 @@ class SerialNet : public NetBase { const OpRegistryBase *op_registry, OpConstructContext *construct_context, std::shared_ptr op_def, - DataFormat input_format, + bool has_data_format, bool is_quantize_model = false); protected: diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc index b190f05f4f258c27aabc0f209e271572257fb4f3..ff409e571bab3d92febd5cd8918c9ca4072d40cb 100644 --- a/mace/core/runtime/opencl/opencl_util.cc +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -152,7 +152,7 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( const std::string &output_name, const mace::DataType dt, const mace::MemoryType mem_type, - const DataFormat data_format) { + bool has_data_format) { std::unique_ptr op(new OperatorDef); std::string op_name = "mace_node_" + output_name; op->set_name(op_name); @@ -169,8 +169,8 @@ std::shared_ptr OpenCLUtil::CreateTransformOpDef( arg->set_name("T"); arg->set_i(static_cast(dt)); arg = op->add_arg(); - arg->set_name("data_format"); - arg->set_i(data_format); + arg->set_name("has_data_format"); + arg->set_i(has_data_format); if (!input_shape.empty()) { OutputShape *shape = op->add_output_shape(); for (auto value : input_shape) { diff --git a/mace/core/runtime/opencl/opencl_util.h b/mace/core/runtime/opencl/opencl_util.h index ec399d87600dc9529c9d94f909ab6d45cd6f4a3e..cae05f831cc5198a3e1e88a1edfc1222e1f4a15f 100644 --- a/mace/core/runtime/opencl/opencl_util.h +++ b/mace/core/runtime/opencl/opencl_util.h @@ -49,7 +49,7 @@ class OpenCLUtil { const std::string &output_name, const mace::DataType dt, const MemoryType mem_type, - const DataFormat data_format); + bool has_data_format); }; } // namespace mace diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index eb10dc89bed268fc1bd8d5772e5acac551c90d0e..57844248d57137240382b64752dba309393b7065 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -264,31 +264,35 @@ MaceStatus Workspace::PreallocateOutputTensor( bool is_quantize_model = IsQuantizedModel(net_def); for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { std::unique_ptr tensor - (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first), - tensor_mem.second.second, + (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id), + tensor_mem.second.data_type, false, tensor_mem.first)); - if (mem_blocks[tensor_mem.second.first].mem_type() - == MemoryType::GPU_IMAGE) { - VLOG(1) << "Tensor: " << tensor_mem.first - << " Mem: " << tensor_mem.second.first - << " Data type: " << tensor->dtype() - << " Image shape: " - << tensor->UnderlyingBuffer()->shape()[0] - << ", " - << tensor->UnderlyingBuffer()->shape()[1]; - tensor->set_data_format(DataFormat::NHWC); - } else { - VLOG(1) << "Tensor: " << tensor_mem.first - << " Mem: " << tensor_mem.second.first - << " Data type: " << tensor->dtype() - << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); - if (mem_blocks[tensor_mem.second.first].mem_type() - == MemoryType::GPU_BUFFER || - is_quantize_model) { + if (tensor_mem.second.has_data_format) { + if (mem_blocks[tensor_mem.second.mem_id].mem_type() + == MemoryType::GPU_IMAGE) { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.mem_id + << " Data type: " << tensor->dtype() + << " Image shape: " + << tensor->UnderlyingBuffer()->shape()[0] + << ", " + << tensor->UnderlyingBuffer()->shape()[1]; tensor->set_data_format(DataFormat::NHWC); } else { - tensor->set_data_format(DataFormat::NCHW); + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.mem_id + << " Data type: " << tensor->dtype() + << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); + if (mem_blocks[tensor_mem.second.mem_id].mem_type() + == MemoryType::GPU_BUFFER || + is_quantize_model) { + tensor->set_data_format(DataFormat::NHWC); + } else { + tensor->set_data_format(DataFormat::NCHW); + } } + } else { + tensor->set_data_format(DataFormat::DF_NONE); } tensor_map_[tensor_mem.first] = std::move(tensor); } diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 97d3608116914423be21b05f02307b64a850eabd..8ab0b0d4a6852e37af8d2bf894e494d17792724b 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -170,6 +170,15 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else { + return DataFormat::DF_NONE; + } +} DEFINE_string(model_name, "", @@ -186,6 +195,12 @@ DEFINE_string(output_node, DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); @@ -222,8 +237,10 @@ DEFINE_int32(cpu_affinity_policy, 1, bool RunModel(const std::vector &input_names, const std::vector> &input_shapes, + const std::vector &input_data_formats, const std::vector &output_names, - const std::vector> &output_shapes) { + const std::vector> &output_shapes, + const std::vector &output_data_formats) { // load model DeviceType device_type = ParseDeviceType(FLAGS_device); // configuration @@ -324,7 +341,8 @@ bool RunModel(const std::vector &input_names, inputs_size[input_names[i]] = input_size; auto buffer_in = std::shared_ptr(new float[input_size], std::default_delete()); - inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -333,7 +351,8 @@ bool RunModel(const std::vector &input_names, std::multiplies()); auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); - outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out, + output_data_formats[i]); } if (!FLAGS_input_dir.empty()) { @@ -485,11 +504,25 @@ int Main(int argc, char **argv) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + str_util::Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + str_util::Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + bool ret = false; for (int i = 0; i < FLAGS_restart_round; ++i) { std::cout << "restart round " << i << std::endl; ret = - RunModel(input_names, input_shape_vec, output_names, output_shape_vec); + RunModel(input_names, input_shape_vec, input_data_formats, + output_names, output_shape_vec, output_data_formats); } if (ret) { return 0; diff --git a/mace/libmace/capability.cc b/mace/libmace/capability.cc index 2989cbc16f8432842858af66e7682678d7a09f2f..c9dff5dc73782d6831a9b4a59d0e9aa22ada2e99 100644 --- a/mace/libmace/capability.cc +++ b/mace/libmace/capability.cc @@ -143,6 +143,7 @@ void BMNet::SetUp() { // Add input and output information for (size_t i = 0; i < input_names_.size(); ++i) { InputInfo *info = net_.add_input_info(); + info->set_data_format(DataFormat::NHWC); info->set_name(input_names_[i]); for (auto d : input_shapes_[i]) { info->add_dims(static_cast(d)); @@ -243,8 +244,8 @@ void BMNet::AddConv(const std::string &conv_type, op_def->add_output(output_name); AddIntsArg(op_def, "strides", strides); AddIntArg(op_def, "padding", padding_type); + AddIntArg(op_def, "has_data_format", 1); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "data_format", 1); if (has_relu6) { AddStringArg(op_def, "activation", "RELUX"); AddFloatArg(op_def, "max_limit", 6); @@ -270,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name, op_def->add_output(output); AddIntArg(op_def, "type", type); AddIntArg(op_def, "T", DT_HALF); - AddIntArg(op_def, "data_format", 1); + AddIntArg(op_def, "has_data_format", 1); OutputShape *shape = op_def->add_output_shape(); for (auto dim : output_shape) { shape->add_dims(dim); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 63dcd7adc6ef326df1da9b95519aba036cf299a5..2d0acb450aadf9d60b285e4fb71a748c1870fa4f 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -470,6 +470,9 @@ MaceStatus MaceEngine::Impl::Init( shape[i] = input_info_map_[input_name].dims(i); } input_tensor->Resize(shape); + // Set to the default data format + input_tensor->set_data_format(static_cast( + input_info_map_[input_name].data_format())); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -477,7 +480,9 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's outputs " << MakeString(MapKeys(output_info_map_)); } +#ifdef MACE_ENABLE_HEXAGON ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT); +#endif } #ifdef MACE_ENABLE_HEXAGON if (device_type_ == HEXAGON) { @@ -559,47 +564,51 @@ MaceEngine::Impl::~Impl() { MaceStatus MaceEngine::Impl::TransposeInput( const std::pair &input, Tensor *input_tensor) { - if (device_->device_type() == DeviceType::CPU && - input.second.shape().size() == 4 && - input.second.data_format() == NHWC && - !is_quantized_model_) { - VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; - input_tensor->set_data_format(DataFormat::NCHW); - std::vector dst_dims = {0, 3, 1, 2}; - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); - } else if ( - (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && - input.second.shape().size() == 4 && - input.second.data_format() == DataFormat::NCHW) { - VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; - std::vector dst_dims = {0, 2, 3, 1}; - input_tensor->set_data_format(DataFormat::NHWC); - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); - } else { - input_tensor->set_data_format(input.second.data_format()); - MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - memcpy(input_data, input.second.data().get(), - input_tensor->size() * sizeof(float)); - return MaceStatus::MACE_SUCCESS; + bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE; + DataFormat data_format = DataFormat::DF_NONE; + if (has_data_format) { + if (device_->device_type() == DeviceType::CPU && + input.second.shape().size() == 4 && + input.second.data_format() == NHWC && + !is_quantized_model_) { + VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; + input_tensor->set_data_format(DataFormat::NCHW); + std::vector dst_dims = {0, 3, 1, 2}; + std::vector output_shape = + TransposeShape(input.second.shape(), dst_dims); + MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); + } else if ( + (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && + input.second.shape().size() == 4 && + input.second.data_format() == DataFormat::NCHW) { + VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; + std::vector dst_dims = {0, 2, 3, 1}; + input_tensor->set_data_format(DataFormat::NHWC); + std::vector output_shape = + TransposeShape(input.second.shape(), dst_dims); + MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); + } + data_format = input.second.data_format(); } + input_tensor->set_data_format(data_format); + MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + memcpy(input_data, input.second.data().get(), + input_tensor->size() * sizeof(float)); + return MaceStatus::MACE_SUCCESS; } MaceStatus MaceEngine::Impl::TransposeOutput( @@ -607,38 +616,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput( std::pair *output) { // save output if (output_tensor != nullptr && output->second.data() != nullptr) { - if (device_->device_type() == DeviceType::CPU && - output->second.shape().size() == 4 && - output->second.data_format() != output_tensor->data_format()) { - MACE_CHECK(output_tensor->data_format() == NCHW); - VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC"; - std::vector dst_dims = {0, 2, 3, 1}; - std::vector shape = - TransposeShape(output_tensor->shape(), - dst_dims); - int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1, - std::multiplies()); - MACE_CHECK(output_size <= output->second.impl_->buffer_size) - << "Output size exceeds buffer size: shape" - << MakeString(shape) << " vs buffer size " - << output->second.impl_->buffer_size; - output->second.impl_->shape = shape; - Tensor::MappingGuard output_guard(output_tensor); - const float *output_data = output_tensor->data(); - return ops::Transpose(output_data, - output_tensor->shape(), - dst_dims, - output->second.data().get()); - } else if (device_->device_type() == DeviceType::GPU && + if (output_tensor->data_format() != DataFormat::DF_NONE && + output->second.data_format() != DataFormat::DF_NONE && output->second.shape().size() == 4 && output->second.data_format() != output_tensor->data_format()) { VLOG(1) << "Transform output " << output->first << " from " << output_tensor->data_format() << " to " << output->second.data_format(); - std::vector dst_dims = {0, 3, 1, 2}; - if (output_tensor->data_format() == NCHW) { + std::vector dst_dims; + if (output_tensor->data_format() == NCHW && + output->second.data_format() == NHWC) { dst_dims = {0, 2, 3, 1}; + } else if (output_tensor->data_format() == NHWC && + output->second.data_format() == NCHW) { + dst_dims = {0, 3, 1, 2}; + } else { + LOG(FATAL) <<"Not supported output data format: " + << output->second.data_format() << " vs " + << output_tensor->data_format(); } + VLOG(1) << "Transform output " << output->first << " from " + << output_tensor->data_format() << " to " + << output->second.data_format(); std::vector shape = TransposeShape(output_tensor->shape(), dst_dims); diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index ca67c4fb14d825fde6c8c831733bc27eb01fb8f7..a8883e1431205f46e5abbb2a78f4b45d8537cec7 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -35,8 +35,8 @@ class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", NHWC))) {} + has_data_format_(Operation::GetOptionalArg("has_data_format", 0)) + {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -57,7 +57,7 @@ class BiasAddOp : public Operation { const float *bias_ptr = bias->data(); float *output_ptr = output->mutable_data(); - if (input->dim_size() == 4 && data_format_ == NCHW) { + if (input->dim_size() == 4 && has_data_format_) { const index_t batch = input->dim(0); const index_t channels = input->dim(1); const index_t height_width = input->dim(2) * input->dim(3); @@ -90,7 +90,7 @@ class BiasAddOp : public Operation { } private: - DataFormat data_format_; + int has_data_format_; }; #ifdef MACE_ENABLE_OPENCL @@ -99,8 +99,7 @@ class BiasAddOp : public Operation { public: explicit BiasAddOp(OpConstructContext *context) : Operation(context), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", NHWC))) { + has_data_format_(Operation::GetOptionalArg("has_data_format", 1)) { MemoryType mem_type; if (context->device()->gpu_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; @@ -121,13 +120,13 @@ class BiasAddOp : public Operation { Tensor *output = this->Output(0); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, + MACE_CHECK(input->dim_size() == 4 && has_data_format_, "gpu only support biasadd for 4-dimensional NHWC format tensor"); return kernel_->Compute(context, input, bias, output); } private: - DataFormat data_format_; + int has_data_format_; std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 920a478f7202d6af7bef000ea4693cc8aa67c292..7de89dd2296829390eb1964911af5378c6edf9cc 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -42,7 +42,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { OpDefBuilder("BiasAdd", "BiasAddBM") .Input("Input") .Input("Bias") - .AddIntArg("data_format", data_format) + .AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 92b918592f984692ccaed7744bb4f4cc9fb3a17e..2e4764cac8ad2cf1f303a2e53c64fda444023fa3 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -36,7 +36,7 @@ void BiasAddSimple() { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); @@ -139,7 +139,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("InputNCHW") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index 15f6e7d323e7885f779a015d99403e9ed7fc6f2d..229d4eb9657432f7966368da759cb0b497972ee9 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -39,14 +39,14 @@ class BufferTransformOp : public Operation { auto type = static_cast(Operation::GetOptionalArg( "buffer_type", static_cast(CONV2D_FILTER))); - auto data_format = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); + bool has_data_format = Operation::GetOptionalArg("has_data_format", 0) + != 0; MemoryType in_mem_type = context->workspace()->GetTensor( operator_def_->input(0))->memory_type(); return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( context, input, type, out_mem_type_, wino_blk_size_, - data_format, output); + has_data_format, output); } private: diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index b0785ac0d1304808ea91c0875679bbcef8e280ad..6b2ac58a23e3ebbcb59e72300b682cd809263cca 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -60,9 +60,9 @@ class ConcatOp : public ConcatOpBase { MACE_UNUSED(context); if (!checked_) { Validate(); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_ == 3) axis_ = 1; else if (axis_ == 2) axis_ = 3; else if (axis_ == 1) axis_ = 2; @@ -251,9 +251,12 @@ void RegisterConcat(OpRegistryBase *op_registry) { if (op->output_shape(0).dims_size() != 4) { return { DeviceType::CPU }; } else { + int has_data_format = + ProtoArgHelper::GetOptionalArg( + *op, "has_data_format", 0); int axis = ProtoArgHelper::GetOptionalArg( *op, "axis", 3); - if (axis != 3) { + if (!has_data_format || axis != 3) { return { DeviceType::CPU }; } bool divisible_four = true; diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 88061a7b19804b9fda948bdc7c556fd2b81638fa..22eb544f96f15465177170868bdf4e68bcf46ab4 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -91,6 +91,7 @@ void OpenCLConcatHelper(int iters, .Input("Input0") .Input("Input1") .AddIntArg("axis", concat_dim) + .AddIntArg("has_data_format", 1) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index fc57920b7fe7a7e3ca2d4aca8bb7fd80a2d76aa7..bc41b11e394835e22ad3670d49e67781ec4ea372 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -100,11 +100,12 @@ TEST_F(ConcatOpTest, CPUSimpleVertical) { } } -TEST_F(ConcatOpTest, CPURandom) { +namespace { +void CPURandomTest(int input_dim, int has_data_format) { static unsigned int seed = time(NULL); - int dim = 5; + int dim = input_dim; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = 1; + int axis = 3; // Construct graph OpsTestNet net; auto builder = OpDefBuilder("Concat", "ConcatTest"); @@ -112,9 +113,13 @@ TEST_F(ConcatOpTest, CPURandom) { builder = builder.Input(MakeString("Input", i)); } builder.AddIntArg("axis", axis) + .AddIntArg("has_data_format", has_data_format) .Output("Output") .Finalize(net.NewOperatorDef()); + if (has_data_format) { + axis = 1; + } std::vector shape_data; GenerateRandomIntTypeData({dim}, &shape_data, 1, dim); std::vector> input_shapes(num_inputs, shape_data); @@ -152,6 +157,13 @@ TEST_F(ConcatOpTest, CPURandom) { } } } +} // namespace + +TEST_F(ConcatOpTest, CPURandom) { + CPURandomTest(5, 0); + CPURandomTest(4, 0); + CPURandomTest(4, 1); +} TEST_F(ConcatOpTest, QuantizedCPURandom) { static unsigned int seed = time(NULL); @@ -186,7 +198,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { builder = builder.Input(MakeString("Input", i)); } builder.AddIntArg("axis", axis_arg) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); @@ -248,7 +260,7 @@ namespace { template void OpenCLRandomTest(const std::vector> &shapes, const int axis, - DataFormat data_format) { + bool has_data_format) { srand(time(nullptr)); int num_inputs = shapes.size(); int concat_axis_size = 0; @@ -275,7 +287,7 @@ void OpenCLRandomTest(const std::vector> &shapes, builder.AddIntArg("axis", axis) .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("data_format", data_format) + .AddIntArg("has_data_format", has_data_format) .OutputShape(expected_shape) .Finalize(net.NewOperatorDef()); @@ -309,38 +321,37 @@ void OpenCLRandomTest(const std::vector> &shapes, } // namespace TEST_F(ConcatOpTest, OPENCLAligned) { - OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLHalfAligned) { - OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 32}, {3, 32, 32, 64}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLUnAligned) { - OpenCLRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, - DataFormat::NHWC); + OpenCLRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3, 1); } TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) { OpenCLRandomTest( {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, - 3, DataFormat::NHWC); + 3, 1); } TEST_F(ConcatOpTest, GPUFallbackToCPU2DInput) { - OpenCLRandomTest({{3, 4}, {3, 4}}, 1, DataFormat::DF_NONE); + OpenCLRandomTest({{3, 4}, {3, 4}}, 1, 0); } TEST_F(ConcatOpTest, GPUFallbackToCPUChanNotDivisibleBy4) { - OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, - DataFormat::DF_NONE); + OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 3, 0); +} + +TEST_F(ConcatOpTest, GPUFallbackToCPUNoDataFormat) { + OpenCLRandomTest({{1, 1, 4, 4}, {1, 1, 4, 4}}, 3, 0); } TEST_F(ConcatOpTest, GPUFallbackToCPUAxis2) { - OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, - DataFormat::DF_NONE); + OpenCLRandomTest({{1, 1, 4, 3}, {1, 1, 4, 3}}, 2, 0); } } // namespace test diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index e75c48e6d5ac7af3e8a674cbda2eab7df53aae2c..92864ae1016fad410ce054887babd09ee2557c59 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -897,8 +897,8 @@ class EltwiseOp : public Operation { scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( "scalar_input_index", 1)), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", 0))) {} + has_data_format_(Operation::GetOptionalArg( + "has_data_format", 0)) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -940,7 +940,7 @@ class EltwiseOp : public Operation { // check if we can broadcast tensor uint32_t rank_diff = static_cast(input0->dim_size() - input1->dim_size()); - if (data_format_ == NCHW) { + if (has_data_format_) { MACE_CHECK( (input0->dim_size() == 4) && ((input1->dim_size() == 0) || @@ -965,7 +965,7 @@ class EltwiseOp : public Operation { const T *input0_ptr = input0->data(); const T *input1_ptr = input1->data(); - if (data_format_ == NCHW && input1->dim_size() > 0) { + if (has_data_format_ && input1->dim_size() > 0) { MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); Tensor::MappingGuard output_guard(output); DstType *output_ptr = output->mutable_data(); @@ -1027,7 +1027,7 @@ class EltwiseOp : public Operation { std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; - DataFormat data_format_; + int has_data_format_; Tensor scalar_tensor_; }; @@ -1042,9 +1042,7 @@ class EltwiseOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)), - data_format_(static_cast(Operation::GetOptionalArg( - "data_format", 0))) + "scalar_input_index", 1)) #ifdef MACE_ENABLE_NEON , eltwise_(static_cast(Operation::GetOptionalArg( "type", static_cast(ops::EltwiseType::NONE)))) @@ -1139,7 +1137,6 @@ class EltwiseOp : public Operation { std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; - DataFormat data_format_; Tensor scalar_tensor_; #ifdef MACE_ENABLE_NEON arm::q8::Eltwise eltwise_; diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index cb239f53e0c01d79dc718d5b3d4eca636b187863..a1959e9df5c388dd6a3605538e83558f3d4e563d 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -44,6 +44,7 @@ void EltwiseBenchmark( .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", {1.2, 2.1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index 4f18810e73213c6113cca236d39e215731435983..58306b625a5ce8e38b0b129c230a4401d3a06ae9 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -75,7 +75,7 @@ void SimpleTensorScalar(const ops::EltwiseType type, .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", x) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -120,7 +120,7 @@ void SimpleTensorEltwise(const ops::EltwiseType type, .AddIntArg("T", DataTypeToEnum::v()) .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) .Output("TOutput"); if (shape0.size() > 1) { @@ -642,7 +642,7 @@ void RandomTensorScalar(const ops::EltwiseType type, .Input("TInput") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", 0.1) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); // Run @@ -699,7 +699,7 @@ void RandomTensorEltwise(const ops::EltwiseType type, .Input("TInput1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); @@ -755,7 +755,7 @@ void Quantized(const std::vector &shape, .Input("TInput0") .Input("TInput1") .AddIntArg("type", static_cast(type)) - .AddIntArg("data_format", DataFormat::NCHW) + .AddIntArg("has_data_format", 1) .Output("TOutput") .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc index cd0d96b8cef49abad2e97cd60a81619065d51ebb..50b9a9e13022f31d36874c4f1b3e435b84e25616 100644 --- a/mace/ops/infer_conv2d_shape.cc +++ b/mace/ops/infer_conv2d_shape.cc @@ -34,9 +34,9 @@ class InferConv2dShapeOp : public Operation { Tensor::MappingGuard output_guard(output); int32_t *output_data = output->mutable_data(); - const int32_t data_format = - Operation::GetOptionalArg("data_format", 0); - const bool isNCHW = data_format == 1; + auto has_data_format = + Operation::GetOptionalArg("has_data_format", 0); + const bool isNCHW = (has_data_format && D == DeviceType::CPU); Padding padding_type = static_cast(Operation::GetOptionalArg( diff --git a/mace/ops/infer_conv2d_shape_test.cc b/mace/ops/infer_conv2d_shape_test.cc index feaaecff8364d9f1a3270105bc03ddb36e3f5be2..333baaf944b34d8e2e0d78cd4e3d84aefa950163 100644 --- a/mace/ops/infer_conv2d_shape_test.cc +++ b/mace/ops/infer_conv2d_shape_test.cc @@ -57,8 +57,8 @@ void TestInferConv2dShapeOp(const std::vector &input_shape, } // namespace TEST_F(InferConv2dShapeOpTest, TestInferConv2dShape) { -TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3}); -TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3}); + TestInferConv2dShapeOp({3, 640, 480, 16}, 1, {3, 640, 480, 3}); + TestInferConv2dShapeOp({3, 640, 480, 16}, 2, {3, 320, 240, 3}); } } // namespace test diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index 954b31f9c676ec61bf7db08caea3d577833478e6..dbb6eab64c22f2941c2710f6a2730a527149f6c3 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -48,7 +48,7 @@ class OpenCLBufferTransformer { const OpenCLBufferType type, const MemoryType out_mem_type, const int wino_blk_size, - const DataFormat data_format, + bool has_data_format, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -67,13 +67,14 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform CPU Buffer " << input->name() << " to GPU Buffer " << internal_tensor->name() << " with data type " << dt; - if (data_format == DataFormat::NHWC && input->shape().size() == 4) { + if (has_data_format && input->shape().size() == 4) { // 1. (NCHW -> NHWC) std::vector dst_dims = {0, 2, 3, 1}; std::vector output_shape = TransposeShape(input->shape(), dst_dims); internal_tensor->Resize(output_shape); + internal_tensor->set_data_format(DataFormat::NHWC); // TODO(liuqi): Only support float now const float *input_ptr = input->data(); Tensor::MappingGuard guard(internal_tensor); @@ -105,13 +106,13 @@ class OpenCLBufferTransformer { VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() << " to CPU Buffer " << output->name() << " with data type " << dt; - if (data_format == DataFormat::NHWC && - internal_tensor.shape().size() == 4) { + if (has_data_format && internal_tensor.shape().size() == 4) { // NHWC -> NCHW std::vector dst_dims = {0, 3, 1, 2}; std::vector output_shape = TransposeShape(internal_tensor.shape(), dst_dims); + output->set_data_format(DataFormat::NCHW); Tensor::MappingGuard guard(&internal_tensor); const float *internal_ptr = internal_tensor.data(); output->Resize(output_shape); diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index 79139bcf27775ef4d4d4124ab7f60eb9b54aac30..7462548a34ae4fb6fc37c3ac1a6db325021c9274 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -166,9 +166,20 @@ bool OpsTestNet::Setup(mace::DeviceType device) { !ws_.GetTensor(input)->is_weight()) { auto input_info = net_def.add_input_info(); input_info->set_name(input); - auto data_format = ProtoArgHelper::GetOptionalArg( - op_def, "data_format", DataFormat::DF_NONE); - input_info->set_data_format(data_format); + auto has_data_format = ProtoArgHelper::GetOptionalArg( + op_def, "has_data_format", 1); + auto is_quantized_op = ProtoArgHelper::GetOptionalArg( + op_def, "T", static_cast(DT_FLOAT)) + == static_cast(DT_UINT8); + if (has_data_format) { + if (is_quantized_op || device == DeviceType::GPU) { + input_info->set_data_format(NHWC); + } else { + input_info->set_data_format(NCHW); + } + } else { + input_info->set_data_format(DataFormat::DF_NONE); + } auto &shape = ws_.GetTensor(input)->shape(); for (auto d : shape) { input_info->add_dims(static_cast(d)); diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index ad1ded81ff1e2c70b59bb5028ff704b4c615c72a..7c20edd9fdd07e5fef11b5719f170d8c35946f51 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -40,9 +40,9 @@ class PadOp : public Operation { constant_value_(Operation::GetOptionalArg( "constant_value", 0.0)) { MACE_CHECK(paddings_.size() == 8); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df) { paddings_ = TransposeShape(paddings_, {0, 1, 6, 7, 2, 3, 4, 5}); } } @@ -55,11 +55,9 @@ class PadOp : public Operation { this->paddings_.size() == static_cast(input->dim_size()) * 2); auto input_shape = input->shape(); for (size_t i = 0; i < paddings_.size(); ++i) { - if (type_ == PadType::REFLECT) { - MACE_CHECK(paddings_[i] < input_shape[i / 2]); - - } else if (type_ == PadType::SYMMETRIC) { - MACE_CHECK(paddings_[i] <= input_shape[i / 2]); + if (type_ == PadType::REFLECT || type_ == PadType::SYMMETRIC) { + MACE_CHECK(paddings_[i] < input_shape[i / 2], paddings_[i], + " vs ", input_shape[i / 2]); } MACE_CHECK(paddings_[i] >= 0); } diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index 0466aa6be486d5f5917f4397006e5cdc4619179e..b449e02f9166c21620daf289baac89b34c25b37f 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -29,7 +29,11 @@ void Pad(int iters, int batch, int height, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + } else { + net.AddRandomInput("Input", {batch, height, width, channels}); + } const std::vector paddings = {0, 0, pad, pad, pad, pad, 0, 0}; OpDefBuilder("Pad", "PadTest") @@ -37,6 +41,7 @@ void Pad(int iters, int batch, int height, .Output("Output") .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", pad_type) + .AddIntArg("has_data_format", 1) .AddFloatArg("constant_value", 1.0) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 63bb449f25057ae8335dc95a6d52042dec2186c6..e68e8eb8d06b864b9c9173ada5fbb2312ec0566c 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -39,7 +39,7 @@ void SimpleConstant() { .Output("Output") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -52,7 +52,7 @@ void SimpleConstant() { .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -101,7 +101,7 @@ void Result(const std::vector &input_shape, .Output(t_output) .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", static_cast(pad_type)) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -179,7 +179,7 @@ TEST_F(PadTest, ComplexCPU) { .Output("TOutput") .AddIntsArg("paddings", {0, 0, 1, 1, 1, 1, 1, 1}) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -217,7 +217,7 @@ void Complex(const std::vector &input_shape, .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", pad_type) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run @@ -234,7 +234,7 @@ void Complex(const std::vector &input_shape, .AddIntsArg("paddings", paddings) .AddIntArg("pad_type", pad_type) .AddFloatArg("constant_value", 1.0) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run diff --git a/mace/ops/reduce.cc b/mace/ops/reduce.cc index e4726987025a65560d5768746afae68298d98b9c..860660be825fbdb2cf980e8b136cfde0909bd155 100644 --- a/mace/ops/reduce.cc +++ b/mace/ops/reduce.cc @@ -94,9 +94,9 @@ class ReduceOp : public ReduceOpBase { int index = axis_[i] >= 0 ? axis_[i] : axis_[i] + input->dim_size(); - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && DataTypeToEnum::value != DT_UINT8 + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && DataTypeToEnum::value != DT_UINT8 && input->dim_size() == 4) { if (index == 1 || index == 2) index = index + 1; else if (index == 3) index = 1; diff --git a/mace/ops/reduce_benchmark.cc b/mace/ops/reduce_benchmark.cc index d97131672c2fba7d988b0e5118a410b54acc571a..1d5fbe33ccb10dc7ffbef9b00353ed93889691fd 100644 --- a/mace/ops/reduce_benchmark.cc +++ b/mace/ops/reduce_benchmark.cc @@ -38,6 +38,7 @@ void Reduce(int iters, int batch, int channels, .Input("Input") .AddIntsArg("axis", axis) .Output("OutputImage") + .AddIntArg("has_data_format", 1) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); diff --git a/mace/ops/reduce_test.cc b/mace/ops/reduce_test.cc index 78a9f9345a8ca4da9eae0a0beedcb8dd1fbed49c..fc284084b25dfe7aac2c6fb936953dbe98e75212 100644 --- a/mace/ops/reduce_test.cc +++ b/mace/ops/reduce_test.cc @@ -44,7 +44,7 @@ void Simple(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -56,7 +56,7 @@ void Simple(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -84,7 +84,7 @@ void Simple3D(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); // Run @@ -588,7 +588,7 @@ void RandomTest(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run @@ -600,7 +600,7 @@ void RandomTest(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run @@ -662,7 +662,7 @@ void TestQuant(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("OutputNCHW") .AddIntArg("T", DT_FLOAT) .Finalize(net.NewOperatorDef()); @@ -687,7 +687,7 @@ void TestQuant(const std::vector &input_shape, .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .AddIntArg("reduce_type", type) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .AddIntArg("T", DT_UINT8) .Finalize(net.NewOperatorDef()); net.RunOp(); diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc index f082cf31a9dbf35aad4ce2ca65c5f4cb6d5679e7..33850faf57fe2332419c5157a4fc89205125ddfa 100644 --- a/mace/ops/reshape.cc +++ b/mace/ops/reshape.cc @@ -77,9 +77,9 @@ class ReshapeOp : public Operation { } Tensor *output = this->Output(OUTPUT); // NHWC -> NCHW - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && D == DeviceType::CPU + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && D == DeviceType::CPU && out_shape.size() == 4 && shape->is_weight()) { std::vector dst_dims = {0, 3, 1, 2}; std::vector out_shape_gpu = TransposeShape( diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc index 79d05bdcaf27bd9dc8cc49f14254d5c1316beaa2..1ee3ee02fce701edf62398dfce12c234b8df78c9 100644 --- a/mace/ops/shape.cc +++ b/mace/ops/shape.cc @@ -35,11 +35,10 @@ class ShapeOp : public Operation { Tensor::MappingGuard output_guard(output); int32_t *output_data = output->mutable_data(); - const int data_format = - Operation::GetOptionalArg("data_format", 0); - if (input->dim_size() == 4 && - D == DeviceType::CPU && - data_format == DataFormat::NCHW) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (D == DeviceType::CPU && + has_df && input->dim_size() == 4) { // transpose NCHW to NHWC for cpu runtime output_data[0] = static_cast(input->dim(0)); output_data[1] = static_cast(input->dim(2)); diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 1ac77cfbec2befb4e1edbd8568ffaf5aa218ce79..7c920d4c115f9650973ab62a2c79d29b677faf83 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -36,9 +36,9 @@ class SplitOp : public Operation { checked_(false) {} void Validate() { - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_ == 3) axis_ = 1; else if (axis_ == 2) axis_ = 3; else if (axis_ == 1) axis_ = 2; diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 45331685059228b32ef92f7abffbc98791d90d0b..17584778a8ae93994530bdbad9f8a53d476b1e18 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -44,6 +44,7 @@ void BMSplitHelper(int iters, } builder .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Warm-up diff --git a/mace/ops/split_test.cc b/mace/ops/split_test.cc index 726d12e6fae54054d504b1a5a07fb9aa70a4e8e5..b693fd0cd3da81e00c5627a852ef6e1c7b97b4c7 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -54,7 +54,7 @@ void RandomTest(const int num_outputs, int axis) { builder = builder.Output(MakeString("Output", i)); } builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Finalize(net.NewOperatorDef()); // Run diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index e67d2672b4df63795cb63bbce9b0e4960d33fa43..15c3408c2bbbfbc6832af699045036d1580152c7 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -32,9 +32,9 @@ class SqueezeOp : public Operation { MACE_UNUSED(context); if (!checked_ && D == DeviceType::CPU && DataTypeToEnum::value != DT_UINT8) { - auto df = static_cast(Operation::GetOptionalArg( - "data_format", DataFormat::DF_NONE)); - if (df == DataFormat::NHWC && this->Input(0)->dim_size() == 4) { + auto has_df = Operation::GetOptionalArg( + "has_data_format", 0); + if (has_df && this->Input(0)->dim_size() == 4) { if (axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2) { axis_[0] = 2; axis_[1] = 3; diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index 3c27f6b9c0ca127726c04599012698a8d4a5d236..8cd829794c16c71b3df1853fedc79eed75d317a8 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -30,7 +30,7 @@ void TestSqueeze(const std::vector &org_shape, OpDefBuilder("Squeeze", "SqueezeTest") .Input("Input") .AddIntsArg("axis", axis) - .AddIntArg("data_format", DataFormat::NHWC) + .AddIntArg("has_data_format", 1) .Output("Output") .Finalize(net.NewOperatorDef()); diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 56f2cceca6863672fa168209504187142ad83d05..0d309d40ade3bd50e660fdeae7ed71ee7055fb3d 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -42,6 +42,7 @@ device_type_map = {'cpu': cvt.DeviceType.CPU.value, data_format_map = { 'NONE': cvt.DataFormat.DF_NONE, 'NHWC': cvt.DataFormat.NHWC, + 'NCHW': cvt.DataFormat.NCHW, } @@ -74,6 +75,13 @@ def parse_float_array_from_str(ints_str): return [float(int_str) for int_str in ints_str.split(',')] +def transpose_shape(shape, dst_order): + t_shape = [0] * len(shape) + for i in range(len(shape)): + t_shape[i] = shape[dst_order[i]] + return t_shape + + def main(unused_args): if not os.path.isfile(FLAGS.model_file): six.print_("Input graph file '" + @@ -139,6 +147,10 @@ def main(unused_args): else: input_node.data_format = data_format_map[input_node_formats[i]] input_node.shape = parse_int_array_from_str(input_node_shapes[i]) + if input_node.data_format == cvt.DataFormat.NCHW and\ + len(input_node.shape) == 4: + input_node.shape = transpose_shape(input_node.shape, [0, 2, 3, 1]) + input_node.data_format = cvt.DataFormat.NHWC if len(input_node_ranges) > i: input_node.range = parse_float_array_from_str(input_node_ranges[i]) option.add_input_node(input_node) @@ -156,6 +168,11 @@ def main(unused_args): else: output_node.data_format = data_format_map[output_node_formats[i]] output_node.shape = parse_int_array_from_str(output_node_shapes[i]) + if output_node.data_format == cvt.DataFormat.NCHW and\ + len(output_node.shape) == 4: + output_node.shape = transpose_shape(output_node.shape, + [0, 2, 3, 1]) + output_node.data_format = cvt.DataFormat.NHWC option.add_output_node(output_node) if FLAGS.check_node != '': diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 03b1e7c3eff6f99dab1e1a36c1dd25edd0ba75a5..4d2b841f58df13f839aae6a4d9391c4bc0f803a4 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -181,6 +181,7 @@ class MaceKeyword(object): mace_global_pooling_str = 'global_pooling' mace_kernel_str = 'kernels' mace_data_format_str = 'data_format' + mace_has_data_format_str = 'has_data_format' mace_filter_format_str = 'filter_format' mace_element_type_str = 'type' mace_activation_type_str = 'activation' @@ -525,6 +526,16 @@ class ConverterUtil(object): return arg return None + @staticmethod + def del_arg(op, arg_name): + found_idx = -1 + for idx in range(len(op.arg)): + if op.arg[idx].name == arg_name: + found_idx = idx + break + if found_idx != -1: + del op.arg[found_idx] + @staticmethod def add_data_format_arg(op, data_format): data_format_arg = op.arg.add() diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 4a170646752f411bcd623606ac541408b3963387..91ffee89be82c6c621d9b9ae599346ef9d62149d 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -1406,21 +1406,17 @@ class Transformer(base_converter.ConverterInterface): def update_data_format(self): print("update data format") - data_format_flag = DataFormat.NHWC.value + data_format_flag = 1 for input_node in self._option.input_nodes.values(): if input_node.data_format.value == DataFormat.DF_NONE.value: - data_format_flag = DataFormat.DF_NONE.value - + data_format_flag = 0 net = self._model for op in net.op: - data_format_arg = ConverterUtil.get_arg( + ConverterUtil.del_arg( op, MaceKeyword.mace_data_format_str) - if not data_format_arg: - data_format_arg = op.arg.add() - data_format_arg.name = MaceKeyword.mace_data_format_str - data_format_arg.i = data_format_flag - elif data_format_arg.i != data_format_flag: - data_format_arg.i = data_format_flag + has_data_format_arg = op.arg.add() + has_data_format_arg.name = MaceKeyword.mace_has_data_format_str + has_data_format_arg.i = data_format_flag return False def quantize_nodes(self): diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 6124792c5f1e395777b3874860e570173cad51c8..f13d05b621c9d32e659b3b908b7fe85836112b7a 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -46,6 +46,7 @@ void MaceRunFunc(const int in_out_size) { for (size_t i = 0; i < input_names.size(); ++i) { InputInfo *info = net_def->add_input_info(); + info->set_data_format(DataFormat::NHWC); info->set_name(input_names[i]); for (auto d : input_shapes[0]) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 438683fec2f694b73ac0d5b132bb73f1bf6377db..49a208fd7fcfda3c625f79ae350cf5245b4fabfd 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -45,6 +45,7 @@ void MaceRun(const int in_out_size, for (size_t i = 0; i < input_names.size(); ++i) { InputInfo *info = net_def->add_input_info(); + info->set_data_format(DataFormat::NHWC); info->set_name(input_names[i]); for (auto d : max_shape) { info->add_dims(static_cast(d)); diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h index 2c2ed7d177fb2b1d834f427a5ecfaa956fe7e648..2257b2162ca6d53e81fd29367594bf860ff115ec 100644 --- a/mace/test/mace_api_test.h +++ b/mace/test/mace_api_test.h @@ -76,6 +76,7 @@ void Conv3x3(const std::string &input_name, .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("has_data_format", 1) .Finalize(&operator_def); OutputShape *shape = operator_def.add_output_shape(); @@ -98,6 +99,7 @@ void Relu(const std::string &input_name, .AddStringArg("activation", "RELU") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddIntArg("device", static_cast(device_type)) + .AddIntArg("has_data_format", 1) .Finalize(&operator_def); net_def->add_op()->CopyFrom(operator_def); diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 7ea06f089eca63a189edac2306641bac81e39c7f..b6ee678600b770cd3cb2793200c51d2833a5c093 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -103,6 +103,16 @@ DeviceType ParseDeviceType(const std::string &device_str) { } } +DataFormat ParseDataFormat(const std::string &data_format_str) { + if (data_format_str == "NHWC") { + return DataFormat::NHWC; + } else if (data_format_str == "NCHW") { + return DataFormat::NCHW; + } else { + return DataFormat::DF_NONE; + } +} + struct mallinfo LogMallinfoChange(struct mallinfo prev) { struct mallinfo curr = mallinfo(); if (prev.arena != curr.arena) { @@ -168,6 +178,12 @@ DEFINE_string(output_node, DEFINE_string(output_shape, "1,224,224,2:1,1,1,10", "output shapes, separated by colon and comma"); +DEFINE_string(input_data_format, + "NHWC", + "input data formats, NONE|NHWC|NCHW"); +DEFINE_string(output_data_format, + "NHWC", + "output data formats, NONE|NHWC|NCHW"); DEFINE_string(input_file, "", "input file name | input file prefix for multiple inputs."); @@ -206,8 +222,10 @@ DEFINE_int32(cpu_affinity_policy, 1, bool RunModel(const std::string &model_name, const std::vector &input_names, const std::vector> &input_shapes, + const std::vector &input_data_formats, const std::vector &output_names, - const std::vector> &output_shapes) { + const std::vector> &output_shapes, + const std::vector &output_data_formats) { DeviceType device_type = ParseDeviceType(FLAGS_device); int64_t t0 = NowMicros(); @@ -325,7 +343,8 @@ bool RunModel(const std::string &model_name, LOG(INFO) << "Open input file failed"; return -1; } - inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in); + inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in, + input_data_formats[i]); } for (size_t i = 0; i < output_count; ++i) { @@ -334,7 +353,8 @@ bool RunModel(const std::string &model_name, std::multiplies()); auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); - outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out); + outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out, + output_data_formats[i]); } LOG(INFO) << "Warm up run"; @@ -498,13 +518,27 @@ int Main(int argc, char **argv) { for (size_t i = 0; i < output_count; ++i) { ParseShape(output_shapes[i], &output_shape_vec[i]); } + std::vector raw_input_data_formats = + str_util::Split(FLAGS_input_data_format, ','); + std::vector raw_output_data_formats = + str_util::Split(FLAGS_output_data_format, ','); + std::vector input_data_formats(input_count); + std::vector output_data_formats(output_count); + for (size_t i = 0; i < input_count; ++i) { + input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]); + } + for (size_t i = 0; i < output_count; ++i) { + output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]); + } + bool ret = false; for (int i = 0; i < FLAGS_restart_round; ++i) { VLOG(0) << "restart round " << i; ret = - RunModel(FLAGS_model_name, input_names, input_shape_vec, - output_names, output_shape_vec); + RunModel(FLAGS_model_name, + input_names, input_shape_vec, input_data_formats, + output_names, output_shape_vec, output_data_formats); } if (ret) { return 0; diff --git a/tools/common.py b/tools/common.py index 6aa7d6326ecb3e40039ed5888010d41e20e753f7..aa4bf9cd1cdf2780ac23b4fc229d4ebefbe409e9 100644 --- a/tools/common.py +++ b/tools/common.py @@ -131,6 +131,12 @@ class DeviceType(object): HEXAGON = 'HEXAGON' +class DataFormat(object): + NONE = "NONE" + NHWC = "NHWC" + NCHW = "NCHW" + + ################################ # Argument types ################################ diff --git a/tools/converter.py b/tools/converter.py index 36e90c813d840e2c1e673e9b5e4403dcc44b6070..c1afcb58354a813164da8c36b214be19a501dff4 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -96,14 +96,10 @@ WinogradParameters = [0, 2, 4] DataFormatStrs = [ "NONE", "NHWC", + "NCHW", ] -class DataFormat(object): - NONE = "NONE" - NHWC = "NHWC" - - class DefaultValues(object): mace_lib_type = MACELibType.static omp_num_threads = -1, @@ -371,6 +367,15 @@ def format_model_config(flags): if not isinstance(value, list): subgraph[key] = [value] subgraph[key] = [str(v) for v in subgraph[key]] + input_size = len(subgraph[YAMLKeyword.input_tensors]) + output_size = len(subgraph[YAMLKeyword.output_tensors]) + + mace_check(len(subgraph[YAMLKeyword.input_shapes]) == input_size, + ModuleName.YAML_CONFIG, + "input shapes' size not equal inputs' size.") + mace_check(len(subgraph[YAMLKeyword.output_shapes]) == output_size, + ModuleName.YAML_CONFIG, + "output shapes' size not equal outputs' size.") for key in [YAMLKeyword.check_tensors, YAMLKeyword.check_shapes]: @@ -399,13 +404,13 @@ def format_model_config(flags): if input_data_formats: if not isinstance(input_data_formats, list): subgraph[YAMLKeyword.input_data_formats] =\ - [input_data_formats] + [input_data_formats] * input_size else: mace_check(len(input_data_formats) - == len(subgraph[YAMLKeyword.input_tensors]), + == input_size, ModuleName.YAML_CONFIG, "input_data_formats should match" - " the size of input") + " the size of input.") for input_data_format in\ subgraph[YAMLKeyword.input_data_formats]: mace_check(input_data_format in DataFormatStrs, @@ -414,17 +419,18 @@ def format_model_config(flags): + str(DataFormatStrs) + ", but got " + input_data_format) else: - subgraph[YAMLKeyword.input_data_formats] = [DataFormat.NHWC] + subgraph[YAMLKeyword.input_data_formats] = \ + [DataFormat.NHWC] * input_size output_data_formats = subgraph.get(YAMLKeyword.output_data_formats, []) if output_data_formats: if not isinstance(output_data_formats, list): subgraph[YAMLKeyword.output_data_formats] = \ - [output_data_formats] + [output_data_formats] * output_size else: mace_check(len(output_data_formats) - == len(subgraph[YAMLKeyword.output_tensors]), + == output_size, ModuleName.YAML_CONFIG, "output_data_formats should match" " the size of output") @@ -435,7 +441,8 @@ def format_model_config(flags): "'output_data_formats' must be in " + str(DataFormatStrs)) else: - subgraph[YAMLKeyword.output_data_formats] = [DataFormat.NHWC] + subgraph[YAMLKeyword.output_data_formats] =\ + [DataFormat.NHWC] * output_size validation_threshold = subgraph.get( YAMLKeyword.validation_threshold, {}) diff --git a/tools/device.py b/tools/device.py index 5bc788f5230c744a3aaa57639c90cea352769fd1..42936a9b48b45e5c0a4a1e122dc42364713efecf 100644 --- a/tools/device.py +++ b/tools/device.py @@ -154,7 +154,9 @@ class DeviceWrapper: input_nodes, output_nodes, input_shapes, + input_data_formats, output_shapes, + output_data_formats, mace_model_dir, model_tag, device_type, @@ -216,6 +218,8 @@ class DeviceWrapper: "--output_node=%s" % ",".join(output_nodes), "--input_shape=%s" % ":".join(input_shapes), "--output_shape=%s" % ":".join(output_shapes), + "--input_data_format=%s" % ",".join(input_data_formats), + "--output_data_format=%s" % ",".join(output_data_formats), "--input_file=%s/%s" % (model_output_dir, input_file_name), "--output_file=%s/%s" % (model_output_dir, @@ -307,6 +311,8 @@ class DeviceWrapper: "--output_node=%s" % ",".join(output_nodes), "--input_shape=%s" % ":".join(input_shapes), "--output_shape=%s" % ":".join(output_shapes), + "--input_data_format=%s" % ",".join(input_data_formats), + "--output_data_format=%s" % ",".join(output_data_formats), "--input_file=%s/%s" % (self.data_dir, input_file_name), "--output_file=%s/%s" % (self.data_dir, output_file_name), "--input_dir=%s" % input_dir, @@ -394,6 +400,8 @@ class DeviceWrapper: output_nodes=subgraphs[0][YAMLKeyword.output_tensors], input_shapes=subgraphs[0][YAMLKeyword.input_shapes], output_shapes=subgraphs[0][YAMLKeyword.output_shapes], + input_data_formats=subgraphs[0][YAMLKeyword.input_data_formats], + output_data_formats=subgraphs[0][YAMLKeyword.output_data_formats], mace_model_dir=mace_model_dir, model_tag=model_name, device_type=DeviceType.GPU, @@ -587,6 +595,10 @@ class DeviceWrapper: YAMLKeyword.output_tensors], input_shapes=subgraphs[0][YAMLKeyword.input_shapes], output_shapes=output_config[YAMLKeyword.output_shapes], + input_data_formats=subgraphs[0][ + YAMLKeyword.input_data_formats], + output_data_formats=subgraphs[0][ + YAMLKeyword.output_data_formats], mace_model_dir=mace_model_dir, model_tag=model_name, device_type=device_type, @@ -652,6 +664,10 @@ class DeviceWrapper: YAMLKeyword.input_shapes], output_shapes=output_config[ YAMLKeyword.output_shapes], + input_data_formats=subgraphs[0][ + YAMLKeyword.input_data_formats], + output_data_formats=subgraphs[0][ + YAMLKeyword.output_data_formats], model_output_dir=model_output_dir, input_data_types=subgraphs[0][ YAMLKeyword.input_data_types], @@ -750,6 +766,8 @@ class DeviceWrapper: output_nodes, input_shapes, output_shapes, + input_data_formats, + output_data_formats, max_num_runs, max_seconds, model_tag, @@ -790,6 +808,8 @@ class DeviceWrapper: '--output_node=%s' % ','.join(output_nodes), '--input_shape=%s' % ':'.join(input_shapes), '--output_shape=%s' % ':'.join(output_shapes), + "--input_data_format=%s" % ",".join(input_data_formats), + "--output_data_format=%s" % ",".join(output_data_formats), '--input_file=%s/%s' % (model_output_dir, input_file_name), "--model_data_file=%s" % model_data_file, '--max_num_runs=%d' % max_num_runs, @@ -845,6 +865,8 @@ class DeviceWrapper: '--output_node=%s' % ','.join(output_nodes), '--input_shape=%s' % ':'.join(input_shapes), '--output_shape=%s' % ':'.join(output_shapes), + "--input_data_format=%s" % ",".join(input_data_formats), + "--output_data_format=%s" % ",".join(output_data_formats), '--input_file=%s/%s' % (self.data_dir, input_file_name), "--model_data_file=%s" % model_data_file, '--max_num_runs=%d' % max_num_runs, @@ -961,6 +983,10 @@ class DeviceWrapper: output_nodes=output_nodes, input_shapes=subgraphs[0][YAMLKeyword.input_shapes], output_shapes=output_shapes, + input_data_formats=subgraphs[0][ + YAMLKeyword.input_data_formats], + output_data_formats=subgraphs[0][ + YAMLKeyword.output_data_formats], max_num_runs=flags.max_num_runs, max_seconds=flags.max_seconds, mace_model_dir=mace_model_dir, @@ -974,8 +1000,7 @@ class DeviceWrapper: opencl_binary_file=opencl_output_bin_path, opencl_parameter_file=opencl_parameter_path, libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH, - link_dynamic=link_dynamic - ) + link_dynamic=link_dynamic) def run(self, abi, diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 399bb3d473fb7a67b967213e2b5b3513ce469e86..11d6934beed8e269c7bad27fad9f718fa2d260e1 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -649,6 +649,8 @@ def validate_model(abi, output_nodes, input_shapes, output_shapes, + input_data_formats, + output_data_formats, model_output_dir, input_data_types, caffe_env, @@ -671,20 +673,12 @@ def validate_model(abi, sh.rm("-rf", "%s/%s" % (model_output_dir, formatted_name)) device.pull_from_data_dir(formatted_name, model_output_dir) - if platform == "tensorflow": - validate(platform, model_file_path, "", - "%s/%s" % (model_output_dir, input_file_name), - "%s/%s" % (model_output_dir, output_file_name), device_type, - ":".join(input_shapes), ":".join(output_shapes), - ",".join(input_nodes), ",".join(output_nodes), - validation_threshold, ",".join(input_data_types), backend, - validation_outputs_data, - log_file) - elif platform == "onnx": + if platform == "tensorflow" or platform == "onnx": validate(platform, model_file_path, "", "%s/%s" % (model_output_dir, input_file_name), "%s/%s" % (model_output_dir, output_file_name), device_type, ":".join(input_shapes), ":".join(output_shapes), + ",".join(input_data_formats), ",".join(output_data_formats), ",".join(input_nodes), ",".join(output_nodes), validation_threshold, ",".join(input_data_types), backend, validation_outputs_data, @@ -703,6 +697,8 @@ def validate_model(abi, "%s/%s" % (model_output_dir, output_file_name), device_type, ":".join(input_shapes), ":".join(output_shapes), + ",".join(input_data_formats), + ",".join(output_data_formats), ",".join(input_nodes), ",".join(output_nodes), validation_threshold, ",".join(input_data_types), backend, validation_outputs_data, @@ -770,6 +766,8 @@ def validate_model(abi, "--output_node=%s" % ",".join(output_nodes), "--input_shape=%s" % ":".join(input_shapes), "--output_shape=%s" % ":".join(output_shapes), + "--input_data_format=%s" % ",".join(input_data_formats), + "--output_data_format=%s" % ",".join(output_data_formats), "--validation_threshold=%f" % validation_threshold, "--input_data_type=%s" % ",".join(input_data_types), "--backend=%s" % ",".join(backend), diff --git a/tools/validate.py b/tools/validate.py index 7b2703c40577deccb68aaebf3a92922f1560c672..d4811ffa1e847c99d150c08fcfc3ef5f3baf2077 100644 --- a/tools/validate.py +++ b/tools/validate.py @@ -148,10 +148,11 @@ def validate_with_file(platform, device_type, value, validation_threshold, log_file) -def validate_tf_model(platform, device_type, model_file, input_file, - mace_out_file, input_names, input_shapes, - output_names, validation_threshold, input_data_types, - log_file): +def validate_tf_model(platform, device_type, model_file, + input_file, mace_out_file, + input_names, input_shapes, input_data_formats, + output_names, output_shapes, output_data_formats, + validation_threshold, input_data_types, log_file): import tensorflow as tf if not os.path.isfile(model_file): common.MaceLogger.error( @@ -174,6 +175,9 @@ def validate_tf_model(platform, device_type, model_file, input_file, common.formatted_file_name(input_file, input_names[i]), input_data_types[i]) input_value = input_value.reshape(input_shapes[i]) + if input_data_formats[i] == common.DataFormat.NCHW and\ + len(input_shapes[i]) == 4: + input_value = input_value.transpose((0, 2, 3, 1)) input_node = graph.get_tensor_by_name( normalize_tf_tensor_name(input_names[i])) input_dict[input_node] = input_value @@ -188,15 +192,20 @@ def validate_tf_model(platform, device_type, model_file, input_file, output_file_name = common.formatted_file_name( mace_out_file, output_names[i]) mace_out_value = load_data(output_file_name) + if output_data_formats[i] == common.DataFormat.NCHW and\ + len(output_shapes[i]) == 4: + mace_out_value = mace_out_value.\ + reshape(output_shapes[i]).transpose((0, 2, 3, 1)) compare_output(platform, device_type, output_names[i], mace_out_value, output_values[i], validation_threshold, log_file) def validate_caffe_model(platform, device_type, model_file, input_file, - mace_out_file, weight_file, input_names, input_shapes, - output_names, output_shapes, validation_threshold, - log_file): + mace_out_file, weight_file, + input_names, input_shapes, input_data_formats, + output_names, output_shapes, output_data_formats, + validation_threshold, log_file): os.environ['GLOG_minloglevel'] = '1' # suprress Caffe verbose prints import caffe if not os.path.isfile(model_file): @@ -215,8 +224,10 @@ def validate_caffe_model(platform, device_type, model_file, input_file, for i in range(len(input_names)): input_value = load_data( common.formatted_file_name(input_file, input_names[i])) - input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, - 2)) + input_value = input_value.reshape(input_shapes[i]) + if input_data_formats[i] == common.DataFormat.NHWC and \ + len(input_shapes[i]) == 4: + input_value = input_value.transpose((0, 3, 1, 2)) input_blob_name = input_names[i] try: if input_names[i] in net.top_names: @@ -232,22 +243,23 @@ def validate_caffe_model(platform, device_type, model_file, input_file, for i in range(len(output_names)): value = net.blobs[output_names[i]].data - out_shape = output_shapes[i] - if len(out_shape) == 4: - out_shape[1], out_shape[2], out_shape[3] = \ - out_shape[3], out_shape[1], out_shape[2] - value = value.reshape(out_shape).transpose((0, 2, 3, 1)) output_file_name = common.formatted_file_name( mace_out_file, output_names[i]) mace_out_value = load_data(output_file_name) + if output_data_formats[i] == common.DataFormat.NHWC and \ + len(output_shapes[i]) == 4: + mace_out_value = mace_out_value.reshape(output_shapes[i])\ + .transpose((0, 3, 1, 2)) compare_output(platform, device_type, output_names[i], mace_out_value, value, validation_threshold, log_file) -def validate_onnx_model(platform, device_type, model_file, input_file, - mace_out_file, input_names, input_shapes, - output_names, output_shapes, validation_threshold, - input_data_types, backend, log_file): +def validate_onnx_model(platform, device_type, model_file, + input_file, mace_out_file, + input_names, input_shapes, input_data_formats, + output_names, output_shapes, output_data_formats, + validation_threshold, input_data_types, + backend, log_file): import onnx if backend == "tensorflow": from onnx_tf.backend import prepare @@ -269,13 +281,16 @@ def validate_onnx_model(platform, device_type, model_file, input_file, input_value = load_data(common.formatted_file_name(input_file, input_names[i]), input_data_types[i]) - input_value = input_value.reshape(input_shapes[i]).transpose((0, 3, 1, - 2)) + input_value = input_value.reshape(input_shapes[i]) + if input_data_formats[i] == common.DataFormat.NHWC and \ + len(input_shapes[i]) == 4: + input_value = input_value.transpose((0, 3, 1, 2)) input_dict[input_names[i]] = input_value onnx_outputs = [] for i in range(len(output_names)): out_shape = output_shapes[i] - if len(out_shape) == 4: + if output_data_formats[i] == common.DataFormat.NHWC and\ + len(out_shape) == 4: out_shape[1], out_shape[2], out_shape[3] = \ out_shape[3], out_shape[1], out_shape[2] onnx_outputs.append( @@ -289,25 +304,32 @@ def validate_onnx_model(platform, device_type, model_file, input_file, for i in range(len(output_names)): out_name = output_names[i] value = output_values[out_name].flatten() - out_shape = output_shapes[i] - if len(out_shape) == 4: - value = value.reshape(out_shape).transpose((0, 2, 3, 1)) output_file_name = common.formatted_file_name(mace_out_file, output_names[i]) mace_out_value = load_data(output_file_name) + if output_data_formats[i] == common.DataFormat.NHWC and \ + len(output_shapes[i]) == 4: + mace_out_value = mace_out_value.reshape(output_shapes[i]) \ + .transpose((0, 3, 1, 2)) compare_output(platform, device_type, output_names[i], mace_out_value, value, validation_threshold, log_file) def validate(platform, model_file, weight_file, input_file, mace_out_file, - device_type, input_shape, output_shape, input_node, output_node, + device_type, input_shape, output_shape, input_data_format_str, + output_data_format_str, input_node, output_node, validation_threshold, input_data_type, backend, validation_outputs_data, log_file): input_names = [name for name in input_node.split(',')] input_shape_strs = [shape for shape in input_shape.split(':')] input_shapes = [[int(x) for x in shape.split(',')] for shape in input_shape_strs] + output_shape_strs = [shape for shape in output_shape.split(':')] + output_shapes = [[int(x) for x in shape.split(',')] + for shape in output_shape_strs] + input_data_formats = [df for df in input_data_format_str.split(',')] + output_data_formats = [df for df in output_data_format_str.split(',')] if input_data_type: input_data_types = [data_type for data_type in input_data_type.split(',')] @@ -323,32 +345,27 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file, else: validation_outputs = validation_outputs_data if validation_outputs: - output_shape_strs = [shape for shape in output_shape.split(':')] - output_shapes = [[int(x) for x in shape.split(',')] - for shape in output_shape_strs] validate_with_file(platform, device_type, output_names, output_shapes, mace_out_file, validation_outputs, validation_threshold, log_file) elif platform == 'tensorflow': - validate_tf_model(platform, device_type, model_file, input_file, - mace_out_file, input_names, input_shapes, - output_names, validation_threshold, input_data_types, + validate_tf_model(platform, device_type, + model_file, input_file, mace_out_file, + input_names, input_shapes, input_data_formats, + output_names, output_shapes, output_data_formats, + validation_threshold, input_data_types, log_file) elif platform == 'caffe': - output_shape_strs = [shape for shape in output_shape.split(':')] - output_shapes = [[int(x) for x in shape.split(',')] - for shape in output_shape_strs] - validate_caffe_model(platform, device_type, model_file, input_file, - mace_out_file, weight_file, input_names, - input_shapes, output_names, output_shapes, + validate_caffe_model(platform, device_type, model_file, + input_file, mace_out_file, weight_file, + input_names, input_shapes, input_data_formats, + output_names, output_shapes, output_data_formats, validation_threshold, log_file) elif platform == 'onnx': - output_shape_strs = [shape for shape in output_shape.split(':')] - output_shapes = [[int(x) for x in shape.split(',')] - for shape in output_shape_strs] - validate_onnx_model(platform, device_type, model_file, input_file, - mace_out_file, input_names, input_shapes, - output_names, output_shapes, + validate_onnx_model(platform, device_type, model_file, + input_file, mace_out_file, + input_names, input_shapes, input_data_formats, + output_names, output_shapes, output_data_formats, validation_threshold, input_data_types, backend, log_file) @@ -379,8 +396,14 @@ def parse_args(): "--device_type", type=str, default="", help="mace runtime device.") parser.add_argument( "--input_shape", type=str, default="1,64,64,3", help="input shape.") + parser.add_argument( + "--input_data_format", type=str, default="NHWC", + help="input data format.") parser.add_argument( "--output_shape", type=str, default="1,64,64,2", help="output shape.") + parser.add_argument( + "--output_data_format", type=str, default="NHWC", + help="output data format.") parser.add_argument( "--input_node", type=str, default="input_node", help="input node") parser.add_argument( @@ -417,6 +440,8 @@ if __name__ == '__main__': FLAGS.device_type, FLAGS.input_shape, FLAGS.output_shape, + FLAGS.input_data_format, + FLAGS.output_data_format, FLAGS.input_node, FLAGS.output_node, FLAGS.validation_threshold,