diff --git a/.travis.yml b/.travis.yml index b0ba55e06293a857defd7943f58ba3e5ed339c46..889c11d3d0557877463188a57d73e76f932ebca0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -114,7 +114,7 @@ jobs: - python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=armeabi-v7a || exit 1 - python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=armeabi-v7a || exit 1 - echo 'Extra Test' - - python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=armeabi-v7a || exit 1 + - python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --run_target=False --target_abis=armeabi-v7a || exit 1 env: TYPE=Extra-Test-ARMEABI-v7a os: linux dist: xenial diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index adb267f3c8bb5361e5b4f929d3888b37b1c014f2..685ad8f2e3edca492b58b66ffc4e0d149146c33c 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -332,18 +332,17 @@ int Main(int argc, char **argv) { std::map inputs; std::map outputs; for (size_t i = 0; i < input_count; ++i) { - // Allocate input and output + // only support float and int32, use char for generalization int64_t input_size = - std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1, + std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 4, std::multiplies()); - auto buffer_in = std::shared_ptr(new float[input_size], - std::default_delete()); + auto buffer_in = std::shared_ptr(new char[input_size], + std::default_delete()); // load input std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), std::ios::in | std::ios::binary); if (in_file.is_open()) { - in_file.read(reinterpret_cast(buffer_in.get()), - input_size * sizeof(float)); + in_file.read(buffer_in.get(), input_size); in_file.close(); } else { LOG(INFO) << "Open input file failed"; @@ -354,12 +353,13 @@ int Main(int argc, char **argv) { } for (size_t i = 0; i < output_count; ++i) { + // only support float and int32, use char for generalization int64_t output_size = std::accumulate(output_shape_vec[i].begin(), - output_shape_vec[i].end(), 1, + output_shape_vec[i].end(), 4, std::multiplies()); - auto buffer_out = std::shared_ptr(new float[output_size], - std::default_delete()); + auto buffer_out = std::shared_ptr(new char[output_size], + std::default_delete()); outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i], buffer_out, output_data_formats[i]); diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc index 0136e3f130e679a13f3e50ccc86c6c4bfda1e4f6..7f86d0eb426d5c5834f9d498f9554c73a0602df0 100644 --- a/mace/core/memory_optimizer.cc +++ b/mace/core/memory_optimizer.cc @@ -33,7 +33,7 @@ namespace mace { bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) { static const std::unordered_set kReuseOp = { - "Reshape", "Identity", "Squeeze", "ExpandDims" + "Reshape", "Identity", "Squeeze" }; return kReuseOp.count(op_type) == 1; } diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 26f615d132421011207429be6cffc516751863bb..845f9ff6f83f3814a9d00face7e3573f134f5e14 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -267,6 +267,7 @@ bool RunModel(const std::vector &input_names, std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1, std::multiplies()); inputs_size[input_names[i]] = input_size; + // Only support float and int32 data type auto buffer_in = std::shared_ptr(new float[input_size], std::default_delete()); inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in, @@ -277,6 +278,7 @@ bool RunModel(const std::vector &input_names, int64_t output_size = std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, std::multiplies()); + // Only support float and int32 data type auto buffer_out = std::shared_ptr(new float[output_size], std::default_delete()); outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out, diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 3476fef1486d9bf95d8dad01eb3f2f73a8520115..0a44eb97e3969329721e4aa7c95cc8340541b3b8 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -284,13 +284,13 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy( class MaceTensor::Impl { public: std::vector shape; - std::shared_ptr data; + std::shared_ptr data; DataFormat format; int64_t buffer_size; }; MaceTensor::MaceTensor(const std::vector &shape, - std::shared_ptr data, + std::shared_ptr data, const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC @@ -345,9 +345,21 @@ MaceTensor::~MaceTensor() = default; const std::vector &MaceTensor::shape() const { return impl_->shape; } -const std::shared_ptr MaceTensor::data() const { return impl_->data; } +const std::shared_ptr MaceTensor::data() const { + return std::static_pointer_cast(impl_->data); +} + +std::shared_ptr MaceTensor::data() { + return std::static_pointer_cast(impl_->data); +} + +std::shared_ptr MaceTensor::raw_data() const { + return impl_->data; +} -std::shared_ptr MaceTensor::data() { return impl_->data; } +std::shared_ptr MaceTensor::raw_mutable_data() { + return impl_->data; +} DataFormat MaceTensor::data_format() const { return impl_->format; @@ -466,8 +478,9 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's inputs: " << MakeString(MapKeys(input_info_map_)); } + DataType input_dt = input_info_map_[input_name].data_type(); Tensor *input_tensor = - ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT); + ws_->CreateTensor(input_name, device_->allocator(), input_dt); // Resize to possible largest shape to avoid resize during running. std::vector shape(input_info_map_[input_name].dims_size()); for (int i = 0; i < input_info_map_[input_name].dims_size(); ++i) { @@ -485,8 +498,9 @@ MaceStatus MaceEngine::Impl::Init( << MakeString(MapKeys(output_info_map_)); } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + DataType output_dt = output_info_map_[output_name].data_type(); Tensor *output_tensor = - ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT); + ws_->CreateTensor(output_name, device_->allocator(), output_dt); output_tensor->set_data_format(NHWC); #endif } @@ -572,54 +586,71 @@ MaceStatus MaceEngine::Impl::TransposeInput( Tensor *input_tensor) { bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE; DataFormat data_format = DataFormat::DF_NONE; + DataType input_dt = input_tensor->dtype(); if (has_data_format) { + std::vector dst_dims; if (device_->device_type() == DeviceType::CPU && input.second.shape().size() == 4 && input.second.data_format() == NHWC && !is_quantized_model_) { VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; input_tensor->set_data_format(DataFormat::NCHW); - std::vector dst_dims = {0, 3, 1, 2}; - std::vector output_shape = - TransposeShape(input.second.shape(), dst_dims); - MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); + dst_dims = {0, 3, 1, 2}; } else if ( (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && input.second.shape().size() == 4 && input.second.data_format() == DataFormat::NCHW) { VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; - std::vector dst_dims = {0, 2, 3, 1}; input_tensor->set_data_format(DataFormat::NHWC); + dst_dims = {0, 2, 3, 1}; + } + if (!dst_dims.empty()) { std::vector output_shape = TransposeShape(input.second.shape(), dst_dims); MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - return ops::Transpose(input.second.data().get(), - input.second.shape(), - dst_dims, - input_data); + if (input_dt == DataType::DT_FLOAT) { + auto input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data, + input_dt); + } else if (input_dt == DataType::DT_INT32) { + auto input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data, + input_dt); + } else { + LOG(FATAL) << "MACE do not support the input data type: " << input_dt; + } } + data_format = input.second.data_format(); } input_tensor->set_data_format(data_format); MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - memcpy(input_data, input.second.data().get(), - input_tensor->size() * sizeof(float)); + if (input_dt == DataType::DT_FLOAT) { + auto input_data = input_tensor->mutable_data(); + memcpy(input_data, input.second.data().get(), + input_tensor->size() * sizeof(float)); + } else if (input_dt == DataType::DT_INT32) { + auto input_data = input_tensor->mutable_data(); + memcpy(input_data, input.second.data().get(), + input_tensor->size() * sizeof(int)); + } else { + LOG(FATAL) << "MACE do not support the input data type: " << input_dt; + } return MaceStatus::MACE_SUCCESS; } MaceStatus MaceEngine::Impl::TransposeOutput( const mace::Tensor *output_tensor, std::pair *output) { + DataType output_dt = output_tensor->dtype(); // save output if (output_tensor != nullptr && output->second.data() != nullptr) { if (output_tensor->data_format() != DataFormat::DF_NONE && @@ -655,11 +686,23 @@ MaceStatus MaceEngine::Impl::TransposeOutput( << output->second.impl_->buffer_size; output->second.impl_->shape = shape; Tensor::MappingGuard output_guard(output_tensor); - const float *output_data = output_tensor->data(); - return ops::Transpose(output_data, - output_tensor->shape(), - dst_dims, - output->second.data().get()); + if (output_dt == DataType::DT_FLOAT) { + auto output_data = output_tensor->data(); + return ops::Transpose(output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); + } else if (output_dt == DataType::DT_INT32) { + auto output_data = output_tensor->data(); + return ops::Transpose(output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get(), + output_dt); + } else { + LOG(FATAL) << "MACE do not support the output data type: " << output_dt; + return MaceStatus::MACE_INVALID_ARGS; + } } else { Tensor::MappingGuard output_guard(output_tensor); auto shape = output_tensor->shape(); @@ -670,8 +713,17 @@ MaceStatus MaceEngine::Impl::TransposeOutput( << MakeString(shape) << " vs buffer size " << output->second.impl_->buffer_size; output->second.impl_->shape = shape; - std::memcpy(output->second.data().get(), output_tensor->data(), - output_size * sizeof(float)); + if (output_dt == DataType::DT_FLOAT) { + std::memcpy(output->second.data().get(), + output_tensor->data(), + output_size * sizeof(float)); + } else if (output_dt == DataType::DT_INT32) { + std::memcpy(output->second.data().get(), + output_tensor->data(), + output_size * sizeof(int)); + } else { + LOG(FATAL) << "MACE do not support the output data type: " << output_dt; + } return MaceStatus::MACE_SUCCESS; } } else { diff --git a/mace/ops/common/transpose.cc b/mace/ops/common/transpose.cc index 469456a1c4424445ba836261c0f9bd71db878155..79a7a6be064368f34864fee115af6d7735b50a83 100644 --- a/mace/ops/common/transpose.cc +++ b/mace/ops/common/transpose.cc @@ -14,19 +14,14 @@ #include "mace/ops/common/transpose.h" -#include - #if defined(MACE_ENABLE_NEON) #include #endif -#include "mace/core/types.h" -#include "mace/utils/logging.h" - namespace mace { namespace ops { -namespace { +namespace transpose { void TransposeNHWCToNCHWC3(const float *input, float *output, const index_t height, @@ -100,119 +95,44 @@ void TransposeNCHWToNHWCC2(const float *input, #endif } } -} // namespace -MaceStatus Transpose(const float *input, - const std::vector &input_shape, - const std::vector &dst_dims, - float *output) { - MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || - (input_shape.size() == 4 && dst_dims.size() == 4), - "Only support 2D or 4D transpose"); +void TransposeNHWCToNCHWC3(const int *input, + int *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; - std::vector output_shape; - for (size_t i = 0; i < dst_dims.size(); ++i) { - output_shape.push_back(input_shape[dst_dims[i]]); - } +#pragma omp parallel for + for (index_t h = 0; h < height; ++h) { + index_t in_offset = h * width * 3; + index_t out_offset = h * width; - if (input_shape.size() == 2) { - MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform"); - index_t height = input_shape[0]; - index_t width = input_shape[1]; - index_t stride_i = height; - index_t stride_j = width; - index_t tile_size = height > 512 || width > 512 ? 64 : 32; -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - output[tile_j * stride_i + tile_i] = - input[tile_i * stride_j + tile_j]; - } - } + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 3; ++c) { + output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c]; } } - } else if (input_shape.size() == 4) { - std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; - std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; - index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3]; - - if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) { - for (index_t b = 0; b < input_shape[0]; ++b) { - TransposeNHWCToNCHWC3(input + b * batch_size, - output + b * batch_size, - input_shape[1], - input_shape[2]); - } - } else if (dst_dims == transpose_order_from_NCHW_to_NHWC - && input_shape[1] == 2) { - for (index_t b = 0; b < input_shape[0]; ++b) { - TransposeNCHWToNHWCC2(input + b * batch_size, - output + b * batch_size, - input_shape[2], - input_shape[3]); - } - } else if (dst_dims == std::vector{0, 2, 1, 3}) { - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channel = input_shape[3]; - index_t channel_raw_size = channel * sizeof(float); - index_t stride_i = height; - index_t stride_j = width; - index_t tile_size = std::max(static_cast(1), - static_cast(std::sqrt( - 8 * 1024 / channel))); -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - memcpy(output + (tile_j * stride_i + tile_i) * channel, - input + (tile_i * stride_j + tile_j) * channel, - channel_raw_size); - } - } - } - } - } else { - std::vector - in_stride{input_shape[1] * input_shape[2] * input_shape[3], - input_shape[2] * input_shape[3], input_shape[3], 1}; - std::vector - out_stride{output_shape[1] * output_shape[2] * output_shape[3], - output_shape[2] * output_shape[3], output_shape[3], 1}; + } +} - std::vector idim(4, 0); - std::vector odim(4, 0); - for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { - for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { - for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { - for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { - idim[dst_dims[0]] = odim[0]; - idim[dst_dims[1]] = odim[1]; - idim[dst_dims[2]] = odim[2]; - idim[dst_dims[3]] = odim[3]; +void TransposeNCHWToNHWCC2(const int *input, + int *output, + const index_t height, + const index_t width) { + index_t image_size = height * width; +#pragma omp parallel for + for (index_t h = 0; h < height; ++h) { + index_t in_offset = h * width; + index_t out_offset = h * width * 2; - output[odim[0] * out_stride[0] + odim[1] * out_stride[1] - + odim[2] * out_stride[2] + odim[3]] = - input[idim[0] * in_stride[0] + idim[1] * in_stride[1] - + idim[2] * in_stride[2] + idim[3]]; - } - } - } + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < 2; ++c) { + output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w]; } } - } else { - MACE_NOT_IMPLEMENTED; } - - return MaceStatus::MACE_SUCCESS; } +} // namespace transpose } // namespace ops } // namespace mace diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h index 5f8e23490698ab71439d2486bc32d269e8d5ee0b..4d2e5a519e680276884fb95ad6edf088738c99d0 100644 --- a/mace/ops/common/transpose.h +++ b/mace/ops/common/transpose.h @@ -15,17 +15,154 @@ #ifndef MACE_OPS_COMMON_TRANSPOSE_H_ #define MACE_OPS_COMMON_TRANSPOSE_H_ +#include #include #include "mace/public/mace.h" +#include "mace/core/tensor.h" namespace mace { namespace ops { +namespace transpose { -MaceStatus Transpose(const float *input, +void TransposeNHWCToNCHWC3(const float *input, + float *output, + const index_t height, + const index_t width); + +void TransposeNHWCToNCHWC3(const int *input, + int *output, + const index_t height, + const index_t width); + +void TransposeNCHWToNHWCC2(const float *input, + float *output, + const index_t height, + const index_t width); + +void TransposeNCHWToNHWCC2(const int *input, + int *output, + const index_t height, + const index_t width); +} // namespace transpose + +template +MaceStatus Transpose(const T *input, const std::vector &input_shape, const std::vector &dst_dims, - float *output); + T *output, + DataType data_type = DataType::DT_FLOAT) { + MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || + (input_shape.size() == 4 && dst_dims.size() == 4), + "Only support 2D or 4D transpose"); + + std::vector output_shape; + for (size_t i = 0; i < dst_dims.size(); ++i) { + output_shape.push_back(input_shape[dst_dims[i]]); + } + + if (input_shape.size() == 2) { + MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform"); + index_t height = input_shape[0]; + index_t width = input_shape[1]; + index_t stride_i = height; + index_t stride_j = width; + index_t tile_size = height > 512 || width > 512 ? 64 : 32; +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < height; i += tile_size) { + for (index_t j = 0; j < width; j += tile_size) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + output[tile_j * stride_i + tile_i] = + input[tile_i * stride_j + tile_j]; + } + } + } + } + } else if (input_shape.size() == 4) { + std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; + std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; + index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3]; + bool supported_dt = (data_type == DataType::DT_FLOAT || + data_type == DataType::DT_INT32); + + if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3 && + supported_dt) { + for (index_t b = 0; b < input_shape[0]; ++b) { + transpose::TransposeNHWCToNCHWC3(input + b * batch_size, + output + b * batch_size, + input_shape[1], + input_shape[2]); + } + } else if (dst_dims == transpose_order_from_NCHW_to_NHWC + && input_shape[1] == 2 && supported_dt) { + for (index_t b = 0; b < input_shape[0]; ++b) { + transpose::TransposeNCHWToNHWCC2(input + b * batch_size, + output + b * batch_size, + input_shape[2], + input_shape[3]); + } + } else if (dst_dims == std::vector{0, 2, 1, 3}) { + index_t height = input_shape[1]; + index_t width = input_shape[2]; + index_t channel = input_shape[3]; + index_t channel_raw_size = channel * sizeof(T); + index_t stride_i = height; + index_t stride_j = width; + index_t tile_size = std::max(static_cast(1), + static_cast(std::sqrt( + 8 * 1024 / channel))); +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < height; i += tile_size) { + for (index_t j = 0; j < width; j += tile_size) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + memcpy(output + (tile_j * stride_i + tile_i) * channel, + input + (tile_i * stride_j + tile_j) * channel, + channel_raw_size); + } + } + } + } + } else { + std::vector + in_stride{input_shape[1] * input_shape[2] * input_shape[3], + input_shape[2] * input_shape[3], input_shape[3], 1}; + std::vector + out_stride{output_shape[1] * output_shape[2] * output_shape[3], + output_shape[2] * output_shape[3], output_shape[3], 1}; + + std::vector idim(4, 0); + std::vector odim(4, 0); + for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { + for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { + for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { + for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { + idim[dst_dims[0]] = odim[0]; + idim[dst_dims[1]] = odim[1]; + idim[dst_dims[2]] = odim[2]; + idim[dst_dims[3]] = odim[3]; + + output[odim[0] * out_stride[0] + odim[1] * out_stride[1] + + odim[2] * out_stride[2] + odim[3]] = + input[idim[0] * in_stride[0] + idim[1] * in_stride[1] + + idim[2] * in_stride[2] + idim[3]]; + } + } + } + } + } + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; +} + } // namespace ops } // namespace mace diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc index 5d7ad1bade6fe8f0dbdbfebedbea892c9d354b28..2d99d7a742659549c750fc1246449f35701f2277 100644 --- a/mace/ops/expand_dims.cc +++ b/mace/ops/expand_dims.cc @@ -14,6 +14,8 @@ #include "mace/core/operator.h" +#include "mace/ops/common/transpose.h" +#include "mace/utils/math.h" namespace mace { namespace ops { @@ -33,21 +35,35 @@ class ExpandDimsOp : public Operation { const Tensor *input = this->Input(0); Tensor *output = this->Output(0); index_t input_dims_size = input->dim_size(); - if ( axis_ < 0 ) { + if (axis_ < 0) { axis_ += input_dims_size + 1; } MACE_CHECK(axis_ >= 0 && axis_ <= input_dims_size, "axis is out of bound: ", axis_); const std::vector input_shape = input->shape(); - std::vector output_shape; - output_shape.insert(output_shape.end(), input_shape.begin(), - input_shape.begin() + axis_); - output_shape.insert(output_shape.end(), 1); - output_shape.insert(output_shape.end(), input_shape.begin() + axis_, - input_shape.end()); + std::vector output_shape(input_shape); + output_shape.insert(output_shape.begin() + axis_, 1); - output->ReuseTensorBuffer(*input); - output->Reshape(output_shape); + bool has_data_format = Operation::GetOptionalArg( + "has_data_format", 0) == 1; + if (has_data_format && output_shape.size() == 4) { + // only tensorflow support expand dim, so the default format is NHWC + // transform NHWC to NCHW + auto t_output_shape = TransposeShape(output_shape, + {0, 3, 1, 2}); + output->Resize(t_output_shape); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + + Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data); + } else { + output->Resize(output_shape); + Tensor::MappingGuard input_guard(input); + auto input_data = input->data(); + output->Copy(input_data, input->size()); + } return MaceStatus::MACE_SUCCESS; } @@ -62,11 +78,6 @@ void RegisterExpandDims(OpRegistryBase *op_registry) { MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, DeviceType::CPU, int32_t); - -#ifdef MACE_ENABLE_QUANTIZE - MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, - DeviceType::CPU, uint8_t); -#endif // MACE_ENABLE_QUANTIZE } } // namespace ops diff --git a/mace/public/mace.h b/mace/public/mace.h index c265401ed3ca3f0eb88a51ed03ab206aa2c7c2b3..8cc251132d9d2ee26ecf70b2684e7eee25f50f15 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -326,7 +326,7 @@ class MACE_API MaceTensor { // of shared_ptr and manage the life cycle of the buffer by yourself. // For example, std::shared_ptr(raw_buffer, [](float *){}); MaceTensor(const std::vector &shape, - std::shared_ptr data, + std::shared_ptr data, const DataFormat format = DataFormat::NHWC); MaceTensor(); MaceTensor(const MaceTensor &other); @@ -339,8 +339,20 @@ class MACE_API MaceTensor { const std::vector &shape() const; const std::shared_ptr data() const; std::shared_ptr data(); + template + const std::shared_ptr data() const { + return std::static_pointer_cast(raw_data()); + } + template + std::shared_ptr data() { + return std::static_pointer_cast(raw_mutable_data()); + } DataFormat data_format() const; + private: + std::shared_ptr raw_data() const; + std::shared_ptr raw_mutable_data(); + private: class Impl; std::unique_ptr impl_; diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 0de68ce4f6af1c0ae6c995e77738015b998dafba..446321a447703414ba00e51d74745c5df635ee69 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -47,6 +47,11 @@ data_format_map = { 'OIHW': cvt.DataFormat.OIHW, } +data_type_map = { + 'float32': mace_pb2.DT_FLOAT, + 'int32': mace_pb2.DT_INT32, +} + def parse_data_type(data_type, device_type): if device_type == cvt.DeviceType.CPU.value or \ @@ -141,6 +146,7 @@ def main(unused_args): option.data_type = parse_data_type(FLAGS.data_type, option.device) input_node_names = FLAGS.input_node.split(',') + input_data_types = FLAGS.input_data_types.split(',') input_node_shapes = FLAGS.input_shape.split(':') input_node_formats = FLAGS.input_data_formats.split(",") if FLAGS.input_range: @@ -152,10 +158,8 @@ def main(unused_args): for i in six.moves.range(len(input_node_names)): input_node = cvt.NodeInfo() input_node.name = input_node_names[i] - if len(input_node_formats) == 1: - input_node.data_format = data_format_map[input_node_formats[0]] - else: - input_node.data_format = data_format_map[input_node_formats[i]] + input_node.data_type = data_type_map[input_data_types[i]] + input_node.data_format = data_format_map[input_node_formats[i]] input_node.shape = parse_int_array_from_str(input_node_shapes[i]) if input_node.data_format == cvt.DataFormat.NCHW and\ len(input_node.shape) == 4: @@ -166,6 +170,7 @@ def main(unused_args): option.add_input_node(input_node) output_node_names = FLAGS.output_node.split(',') + output_data_types = FLAGS.output_data_types.split(',') output_node_shapes = FLAGS.output_shape.split(':') output_node_formats = FLAGS.output_data_formats.split(",") if len(output_node_names) != len(output_node_shapes): @@ -173,10 +178,8 @@ def main(unused_args): for i in six.moves.range(len(output_node_names)): output_node = cvt.NodeInfo() output_node.name = output_node_names[i] - if len(output_node_formats) == 1: - output_node.data_format = data_format_map[output_node_formats[0]] - else: - output_node.data_format = data_format_map[output_node_formats[i]] + output_node.data_type = data_type_map[output_data_types[i]] + output_node.data_format = data_format_map[output_node_formats[i]] output_node.shape = parse_int_array_from_str(output_node_shapes[i]) if output_node.data_format == cvt.DataFormat.NCHW and\ len(output_node.shape) == 4: @@ -290,6 +293,11 @@ def parse_args(): type=str, default="input_node", help="e.g., input_node") + parser.add_argument( + "--input_data_types", + type=str, + default="float32", + help="e.g., float32|int32") parser.add_argument( "--input_data_formats", type=str, @@ -297,6 +305,11 @@ def parse_args(): help="e.g., NHWC,NONE") parser.add_argument( "--output_node", type=str, default="softmax", help="e.g., softmax") + parser.add_argument( + "--output_data_types", + type=str, + default="float32", + help="e.g., float32|int32") parser.add_argument( "--output_data_formats", type=str, diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 7fc877d662a90bc4d6030daab3843b27cb801f80..4ed7156a089a6568bfeb98a4cd40c2e87c7fc67c 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -298,6 +298,7 @@ class NodeInfo(object): def __init__(self): self._name = None + self._data_type = mace_pb2.DT_FLOAT self._shape = [] self._data_format = DataFormat.NHWC self._range = [-1.0, 1.0] @@ -306,6 +307,10 @@ class NodeInfo(object): def name(self): return self._name + @property + def data_type(self): + return self._data_type + @property def shape(self): return self._shape @@ -322,6 +327,10 @@ class NodeInfo(object): def name(self, name): self._name = name + @data_type.setter + def data_type(self, data_type): + self._data_type = data_type + @shape.setter def shape(self, shape): self._shape = shape diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index ec255e3a90296a04d8538c1ff464edb097fe5193..53d57151f02ecd82f4a1d0504fc957542c1011e7 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -102,6 +102,7 @@ TFSupportedOps = [ 'Mean', 'Const', 'Gather', + 'GatherV2', 'StridedSlice', 'Slice', 'ReverseV2', @@ -241,6 +242,7 @@ class TensorflowConverter(base_converter.ConverterInterface): TFOpType.Mean.name: self.convert_mean, TFOpType.Const.name: self.convert_nop, TFOpType.Gather.name: self.convert_gather, + TFOpType.GatherV2.name: self.convert_gather, TFOpType.StridedSlice.name: self.convert_stridedslice, TFOpType.Slice.name: self.convert_slice, TFOpType.ReverseV2.name: self.convert_reverse, @@ -838,16 +840,11 @@ class TensorflowConverter(base_converter.ConverterInterface): op = self.convert_general_op(tf_op) op.type = MaceOp.ExpandDims.name + axis_value = tf_op.inputs[1].eval().astype(np.int32) axis_arg = op.arg.add() axis_arg.name = MaceKeyword.mace_axis_str - try: - axis_value = tf_op.get_attr('dim') - except ValueError: - try: - axis_value = tf_op.get_attr('axis') - except ValueError: - axis_value = 0 axis_arg.i = axis_value + del op.input[1] def convert_squeeze(self, tf_op): op = self.convert_general_op(tf_op) diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 1083e23545767725e2f4e0d9c394d790fd5d0dd3..8fd513e882715757754e3a6997991a67fd4c1cdb 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -323,7 +323,7 @@ class Transformer(base_converter.ConverterInterface): input_info.name = input_node.name input_info.data_format = input_node.data_format.value input_info.dims.extend(input_node.shape) - input_info.data_type = mace_pb2.DT_FLOAT + input_info.data_type = input_node.data_type output_nodes = self._option.check_nodes.values() for output_node in output_nodes: @@ -332,7 +332,7 @@ class Transformer(base_converter.ConverterInterface): output_info.data_format = output_node.data_format.value output_info.dims.extend( self._producer[output_node.name].output_shape[0].dims) - output_info.data_type = mace_pb2.DT_FLOAT + output_info.data_type = output_node.data_type return False diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 0653304fde80b275217eba9332ab4a121c169a9a..8d1c0e28164ef4a375c7308aeca72cb5b22e0534 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -317,17 +317,18 @@ bool RunModel(const std::string &model_name, std::map outputs; for (size_t i = 0; i < input_count; ++i) { // Allocate input and output + // only support float and int32, use char for generalization + // sizeof(int) == 4, sizeof(float) == 4 int64_t input_size = - std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1, + std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 4, std::multiplies()); - auto buffer_in = std::shared_ptr(new float[input_size], - std::default_delete()); + auto buffer_in = std::shared_ptr(new char[input_size], + std::default_delete()); // load input std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]), std::ios::in | std::ios::binary); if (in_file.is_open()) { - in_file.read(reinterpret_cast(buffer_in.get()), - input_size * sizeof(float)); + in_file.read(buffer_in.get(), input_size); in_file.close(); } else { LOG(INFO) << "Open input file failed"; @@ -338,11 +339,12 @@ bool RunModel(const std::string &model_name, } for (size_t i = 0; i < output_count; ++i) { + // only support float and int32, use char for generalization int64_t output_size = - std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4, std::multiplies()); - auto buffer_out = std::shared_ptr(new float[output_size], - std::default_delete()); + auto buffer_out = std::shared_ptr(new char[output_size], + std::default_delete()); outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out, output_data_formats[i]); } @@ -454,12 +456,12 @@ bool RunModel(const std::string &model_name, std::string output_name = FLAGS_output_file + "_" + FormatName(output_names[i]); std::ofstream out_file(output_name, std::ios::binary); + // only support float and int32 int64_t output_size = - std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1, + std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4, std::multiplies()); out_file.write( - reinterpret_cast(outputs[output_names[i]].data().get()), - output_size * sizeof(float)); + outputs[output_names[i]].data().get(), output_size); out_file.flush(); out_file.close(); LOG(INFO) << "Write output file " << output_name << " with size " @@ -524,6 +526,7 @@ int Main(int argc, char **argv) { // get cpu capability Capability cpu_capability = GetCapability(DeviceType::CPU); + float cpu_float32_performance = cpu_capability.float32_performance.exec_time; bool ret = false; for (int i = 0; i < FLAGS_restart_round; ++i) { @@ -531,7 +534,7 @@ int Main(int argc, char **argv) { ret = RunModel(FLAGS_model_name, input_names, input_shape_vec, input_data_formats, output_names, output_shape_vec, output_data_formats, - cpu_capability.float32_performance.exec_time); + cpu_float32_performance); } if (ret) { return 0; diff --git a/tools/common.py b/tools/common.py index 82a25e5d5e6c04c1db474f93cf7dd21c3d1d48d3..0884319ff9f369c0d05271141e16935cdbf57a56 100644 --- a/tools/common.py +++ b/tools/common.py @@ -397,6 +397,7 @@ class YAMLKeyword(object): runtime = 'runtime' data_type = 'data_type' input_data_types = 'input_data_types' + output_data_types = 'output_data_types' input_data_formats = 'input_data_formats' output_data_formats = 'output_data_formats' limit_opencl_kernel_time = 'limit_opencl_kernel_time' diff --git a/tools/converter.py b/tools/converter.py index 7422a4b52fdad97567abdd1d6b962221ff1aecde..5d8c0c5fcdf718f9f4a91e0b7849aab2e5e2d80a 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -65,13 +65,13 @@ RuntimeTypeStrs = [ "cpu+gpu" ] -InputDataTypeStrs = [ +InOutDataTypeStrs = [ "int32", "float32", ] -InputDataType = Enum('InputDataType', - [(ele, ele) for ele in InputDataTypeStrs], +InOutDataType = Enum('InputDataType', + [(ele, ele) for ele in InOutDataTypeStrs], type=str) FPDataTypeStrs = [ @@ -410,17 +410,23 @@ def format_model_config(flags): else: subgraph[key] = [] - input_data_types = subgraph.get(YAMLKeyword.input_data_types, "") - if input_data_types: - if not isinstance(input_data_types, list): - subgraph[YAMLKeyword.input_data_types] = [input_data_types] - for input_data_type in subgraph[YAMLKeyword.input_data_types]: - mace_check(input_data_type in InputDataTypeStrs, - ModuleName.YAML_CONFIG, - "'input_data_types' must be in " - + str(InputDataTypeStrs)) - else: - subgraph[YAMLKeyword.input_data_types] = [] + for key in [YAMLKeyword.input_data_types, + YAMLKeyword.output_data_types]: + if key == YAMLKeyword.input_data_types: + count = input_size + else: + count = output_size + data_types = subgraph.get(key, "") + if data_types: + if not isinstance(data_types, list): + subgraph[key] = [data_types] * count + for data_type in subgraph[key]: + mace_check(data_type in InOutDataTypeStrs, + ModuleName.YAML_CONFIG, + key + " must be in " + + str(InOutDataTypeStrs)) + else: + subgraph[key] = [InOutDataType.float32] * count input_data_formats = subgraph.get(YAMLKeyword.input_data_formats, []) @@ -722,8 +728,10 @@ def convert_model(configs, cl_mem_type): model_config[YAMLKeyword.model_sha256_checksum], model_config[YAMLKeyword.weight_sha256_checksum], ",".join(subgraphs[0][YAMLKeyword.input_tensors]), + ",".join(subgraphs[0][YAMLKeyword.input_data_types]), ",".join(subgraphs[0][YAMLKeyword.input_data_formats]), ",".join(subgraphs[0][YAMLKeyword.output_tensors]), + ",".join(subgraphs[0][YAMLKeyword.output_data_types]), ",".join(subgraphs[0][YAMLKeyword.output_data_formats]), ",".join(subgraphs[0][YAMLKeyword.check_tensors]), runtime, diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 02b8ffbaee2dcf2d979432fab3195a07b5a40591..969ceda6405cc25e8a41c7e6b1d803d414ff4f5b 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -480,8 +480,10 @@ def gen_model_code(model_codegen_dir, model_sha256_checksum, weight_sha256_checksum, input_nodes, + input_data_types, input_data_formats, output_nodes, + output_data_types, output_data_formats, check_nodes, runtime, @@ -515,8 +517,10 @@ def gen_model_code(model_codegen_dir, "--model_checksum=%s" % model_sha256_checksum, "--weight_checksum=%s" % weight_sha256_checksum, "--input_node=%s" % input_nodes, + "--input_data_types=%s" % input_data_types, "--input_data_formats=%s" % input_data_formats, "--output_node=%s" % output_nodes, + "--output_data_types=%s" % output_data_types, "--output_data_formats=%s" % output_data_formats, "--check_node=%s" % check_nodes, "--runtime=%s" % runtime,