From 1ca2f9e2080dd5f96425f422dd163263aff25074 Mon Sep 17 00:00:00 2001 From: liuqi Date: Tue, 21 May 2019 11:35:11 +0800 Subject: [PATCH] Fix some bugs. 1. OpenCL Runtime destructor bug. 2. Net's outputs are used in net again. 3. Average pooling precision bug. --- mace/core/net_def_adapter.cc | 34 +++++++++++++++---- mace/core/net_def_adapter.h | 3 +- mace/core/runtime/opencl/opencl_runtime.cc | 4 ++- mace/ops/pooling.cc | 38 +++++++++++++++------- 4 files changed, 59 insertions(+), 20 deletions(-) diff --git a/mace/core/net_def_adapter.cc b/mace/core/net_def_adapter.cc index 5d3915b4..2e450b09 100644 --- a/mace/core/net_def_adapter.cc +++ b/mace/core/net_def_adapter.cc @@ -164,6 +164,7 @@ MaceStatus NetDefAdapter::AdaptNetDef( input_info->set_dims(j, input_shape[j]); } } + tensor_shape_map.emplace(input_info->name(), input_shape); output_map.emplace(input_info->name(), InternalOutputInfo( mem_type, input_info->data_type(), input_data_format, input_shape, -1)); @@ -220,6 +221,13 @@ MaceStatus NetDefAdapter::AdaptNetDef( &op_output_data_format, target_net_def)); } + input_size = op_def.input_size(); + for (int i = 0; i < input_size; ++i) { + if (output_map.count(op_def.input(i)) == 1) { + output_map.at(op_def.input(i)).consumer_op_indices.push_back( + target_net_def->op_size()); + } + } int output_size = op_def.output_size(); for (int out_idx = 0; out_idx < output_size; ++out_idx) { @@ -276,6 +284,15 @@ MaceStatus NetDefAdapter::AdaptNetDef( output_op_def->set_output(i, t_output_name); } } + for (int idx : internal_output_info.consumer_op_indices) { + auto consumer_op_def = target_net_def->mutable_op(idx); + int input_size = consumer_op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (consumer_op_def->input(i) == output_info.name()) { + consumer_op_def->set_input(i, t_output_name); + } + } + } auto transformed_op_def = target_net_def->add_op(); OpenCLUtil::BuildTransformOpDef( t_output_name, @@ -414,12 +431,10 @@ MaceStatus NetDefAdapter::AdaptDataFormat( } src_df = output_map->at(op_def->input(i)).data_format; dst_df = inputs_data_format[i]; - if (src_df == DataFormat::NONE - || dst_df == DataFormat::NONE - || output_map->at(op_def->input(i)).shape.size() != 4) { - continue; - } - if (src_df != dst_df) { + if (src_df != DataFormat::NONE + && dst_df != DataFormat::NONE + && output_map->at(op_def->input(i)).shape.size() == 4 + && src_df != dst_df) { std::string transformed_name = TransformedName(op_def->input(i), "data_format", static_cast(dst_df)); if (transformed_set->count(transformed_name) == 0) { @@ -461,6 +476,9 @@ MaceStatus NetDefAdapter::AdaptDataFormat( SetProtoArg(transpose_op_def, OutputMemoryTypeTagName(), target_mem_type); + // update tensor consumer information + output_map->at(op_def->input(i)).consumer_op_indices.push_back( + target_net_def->op_size() - 1); // update output information map output_map->emplace( @@ -545,6 +563,10 @@ MaceStatus NetDefAdapter::AdaptMemoryType( OutputMemoryTypeTagName(), dst_mem_type); + // update tensor consumer information + output_map->at(op_def->input(i)).consumer_op_indices.push_back( + target_net_def->op_size() - 1); + // update output information map output_map->emplace( transformed_name, diff --git a/mace/core/net_def_adapter.h b/mace/core/net_def_adapter.h index d924d84c..0268329e 100644 --- a/mace/core/net_def_adapter.h +++ b/mace/core/net_def_adapter.h @@ -78,13 +78,14 @@ class NetDefAdapter { const std::vector &shape, int op_idx) : mem_type(mem_type), dtype(dtype), data_format(data_format), - shape(shape), op_idx(op_idx) {} + shape(shape), op_idx(op_idx), consumer_op_indices() {} MemoryType mem_type; DataType dtype; DataFormat data_format; std::vector shape; // tensor shape int op_idx; // operation which generate the tensor + std::vector consumer_op_indices; }; typedef std::unordered_map TensorInfoMap; diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index e26c6048..791e6013 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -436,7 +436,9 @@ OpenCLRuntime::OpenCLRuntime( } OpenCLRuntime::~OpenCLRuntime() { - command_queue_->finish(); + if (command_queue_ != nullptr) { + command_queue_->finish(); + } built_program_map_.clear(); // We need to control the destruction order, which has dependencies command_queue_.reset(); diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index ce726dcb..ef72ca93 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -426,8 +426,8 @@ class PoolingOp : public PoolingOpBase { (in_h_end - in_h_begin) * (in_w_end - in_w_begin); MACE_CHECK(block_size > 0); - std::vector average_buffer(channels); - uint16_t *avg_buffer = average_buffer.data(); + std::vector average_buffer(channels); + uint32_t *avg_buffer = average_buffer.data(); std::fill_n(avg_buffer, channels, 0); for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { @@ -436,20 +436,34 @@ class PoolingOp : public PoolingOpBase { index_t c = 0; #if defined(MACE_ENABLE_NEON) for (; c <= channels - 16; c += 16) { - uint16x8_t avg_vec[2]; - avg_vec[0] = vld1q_u16(avg_buffer + c); - avg_vec[1] = vld1q_u16(avg_buffer + c + 8); + uint16x8_t tmp_avg[2]; uint8x16_t in_vec = vld1q_u8(in_ptr + c); - avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); - avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); - vst1q_u16(avg_buffer + c, avg_vec[0]); - vst1q_u16(avg_buffer + c + 8, avg_vec[1]); + tmp_avg[0] = vmovl_u8(vget_low_u8(in_vec)); + tmp_avg[1] = vmovl_u8(vget_high_u8(in_vec)); + uint32x4_t avg_vec[4]; + avg_vec[0] = vld1q_u32(avg_buffer + c); + avg_vec[1] = vld1q_u32(avg_buffer + c + 4); + avg_vec[2] = vld1q_u32(avg_buffer + c + 8); + avg_vec[3] = vld1q_u32(avg_buffer + c + 12); + avg_vec[0] = vaddw_u16(avg_vec[0], vget_low_u16(tmp_avg[0])); + avg_vec[1] = vaddw_u16(avg_vec[1], vget_high_u16(tmp_avg[0])); + avg_vec[2] = vaddw_u16(avg_vec[2], vget_low_u16(tmp_avg[1])); + avg_vec[3] = vaddw_u16(avg_vec[3], vget_high_u16(tmp_avg[1])); + vst1q_u32(avg_buffer + c, avg_vec[0]); + vst1q_u32(avg_buffer + c + 4, avg_vec[1]); + vst1q_u32(avg_buffer + c + 8, avg_vec[2]); + vst1q_u32(avg_buffer + c + 12, avg_vec[3]); } for (; c <= channels - 8; c += 8) { - uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); uint8x8_t in_vec = vld1_u8(in_ptr + c); - avg_vec = vaddw_u8(avg_vec, in_vec); - vst1q_u16(avg_buffer + c, avg_vec); + uint16x8_t tmp_avg = vmovl_u8(in_vec); + uint32x4_t avg_vec[2]; + avg_vec[0] = vld1q_u32(avg_buffer + c); + avg_vec[1] = vld1q_u32(avg_buffer + c + 4); + avg_vec[0] = vaddw_u16(avg_vec[0], vget_low_u16(tmp_avg)); + avg_vec[1] = vaddw_u16(avg_vec[1], vget_high_u16(tmp_avg)); + vst1q_u32(avg_buffer + c, avg_vec[0]); + vst1q_u32(avg_buffer + c + 4, avg_vec[1]); } #endif for (; c < channels; ++c) { -- GitLab