提交 80d1c9dd 编写于 作者: L liuqi

Support int32 input data type.

1. Support int32 input data type.
2. Support GatherV2 op
3. Add transpose to ExpandDims op.
上级 ad4953cb
......@@ -114,7 +114,7 @@ jobs:
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_test" --run_target=False --target_abis=armeabi-v7a || exit 1
- python tools/bazel_adb_run.py --target="//mace/test:mace_api_mt_test" --run_target=False --target_abis=armeabi-v7a || exit 1
- echo 'Extra Test'
- python tools/bazel_adb_run.py --target="//mace/utils:tuner_test" --run_target=False --target_abis=armeabi-v7a || exit 1
- python tools/bazel_adb_run.py --target="//mace/utils:utils_test" --run_target=False --target_abis=armeabi-v7a || exit 1
env: TYPE=Extra-Test-ARMEABI-v7a
os: linux
dist: xenial
......
......@@ -332,18 +332,17 @@ int Main(int argc, char **argv) {
std::map<std::string, mace::MaceTensor> inputs;
std::map<std::string, mace::MaceTensor> outputs;
for (size_t i = 0; i < input_count; ++i) {
// Allocate input and output
// only support float and int32, use char for generalization
int64_t input_size =
std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1,
std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 4,
std::multiplies<int64_t>());
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
auto buffer_in = std::shared_ptr<char>(new char[input_size],
std::default_delete<char[]>());
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(buffer_in.get()),
input_size * sizeof(float));
in_file.read(buffer_in.get(), input_size);
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
......@@ -354,12 +353,13 @@ int Main(int argc, char **argv) {
}
for (size_t i = 0; i < output_count; ++i) {
// only support float and int32, use char for generalization
int64_t output_size =
std::accumulate(output_shape_vec[i].begin(),
output_shape_vec[i].end(), 1,
output_shape_vec[i].end(), 4,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
auto buffer_out = std::shared_ptr<char>(new char[output_size],
std::default_delete<char[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
buffer_out,
output_data_formats[i]);
......
......@@ -33,7 +33,7 @@ namespace mace {
// Reports whether `op_type` is one of the op types listed in kReuseOp.
//
// Returns true exactly when `op_type` is a member of the set below.
//
// NOTE(review): this listing is a diff view, so two initializer rows are
// shown back to back. The second row (without "ExpandDims") looks like the
// replacement for the first — confirm against the real file.
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
// Built once on first call and shared by all later calls (function-local
// static).
static const std::unordered_set<std::string> kReuseOp = {
"Reshape", "Identity", "Squeeze", "ExpandDims"
"Reshape", "Identity", "Squeeze"
};
// count() on a set is 0 or 1, so this is a plain membership test.
return kReuseOp.count(op_type) == 1;
}
......
......@@ -267,6 +267,7 @@ bool RunModel(const std::vector<std::string> &input_names,
std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
std::multiplies<int64_t>());
inputs_size[input_names[i]] = input_size;
// Only support float and int32 data type
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
......@@ -277,6 +278,7 @@ bool RunModel(const std::vector<std::string> &input_names,
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::multiplies<int64_t>());
// Only support float and int32 data type
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
......
......@@ -284,13 +284,13 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
// Private state held by MaceTensor: the shape, the shared data buffer, the
// data format and a buffer size.
//
// NOTE(review): this listing is a diff view, so `data` is shown twice. The
// `std::shared_ptr<void>` row looks like the replacement for the
// `std::shared_ptr<float>` row — confirm against the real file.
class MaceTensor::Impl {
public:
// Dimensions of the tensor.
std::vector<int64_t> shape;
// Shared handle to the caller-provided buffer.
std::shared_ptr<float> data;
std::shared_ptr<void> data;
// Layout tag for the buffer.
DataFormat format;
// Size of the buffer; presumably counted in elements rather than bytes —
// TODO confirm against the constructor.
int64_t buffer_size;
};
MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data,
std::shared_ptr<void> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get());
MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC
......@@ -345,9 +345,21 @@ MaceTensor::~MaceTensor() = default;
const std::vector<int64_t> &MaceTensor::shape() const { return impl_->shape; }
const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
// Read-only accessor: returns the stored buffer handle cast to
// std::shared_ptr<float>. The cast is a static_pointer_cast, so no check is
// made that the buffer really holds floats.
const std::shared_ptr<float> MaceTensor::data() const {
return std::static_pointer_cast<float>(impl_->data);
}
// Non-const accessor: returns the stored buffer handle cast to
// std::shared_ptr<float>. The cast is a static_pointer_cast, so no check is
// made that the buffer really holds floats.
std::shared_ptr<float> MaceTensor::data() {
return std::static_pointer_cast<float>(impl_->data);
}
// Returns the stored buffer handle as an untyped std::shared_ptr<void>
// (const overload).
std::shared_ptr<void> MaceTensor::raw_data() const {
return impl_->data;
}
std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
// Returns the stored buffer handle as an untyped std::shared_ptr<void>
// (non-const overload).
std::shared_ptr<void> MaceTensor::raw_mutable_data() {
return impl_->data;
}
DataFormat MaceTensor::data_format() const {
return impl_->format;
......@@ -466,8 +478,9 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
DataType input_dt = input_info_map_[input_name].data_type();
Tensor *input_tensor =
ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
ws_->CreateTensor(input_name, device_->allocator(), input_dt);
// Resize to possible largest shape to avoid resize during running.
std::vector<index_t> shape(input_info_map_[input_name].dims_size());
for (int i = 0; i < input_info_map_[input_name].dims_size(); ++i) {
......@@ -485,8 +498,9 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(output_info_map_));
}
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(NHWC);
#endif
}
......@@ -572,54 +586,71 @@ MaceStatus MaceEngine::Impl::TransposeInput(
Tensor *input_tensor) {
bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
DataFormat data_format = DataFormat::DF_NONE;
DataType input_dt = input_tensor->dtype();
if (has_data_format) {
std::vector<int> dst_dims;
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
dst_dims = {0, 3, 1, 2};
} else if (
(is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
input.second.shape().size() == 4 &&
input.second.data_format() == DataFormat::NCHW) {
VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
input_tensor->set_data_format(DataFormat::NHWC);
dst_dims = {0, 2, 3, 1};
}
if (!dst_dims.empty()) {
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data<float>().get(),
input.second.shape(),
dst_dims,
input_data,
input_dt);
} else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>();
return ops::Transpose(input.second.data<int>().get(),
input.second.shape(),
dst_dims,
input_data,
input_dt);
} else {
LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
}
}
data_format = input.second.data_format();
}
input_tensor->set_data_format(data_format);
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
if (input_dt == DataType::DT_FLOAT) {
auto input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
} else if (input_dt == DataType::DT_INT32) {
auto input_data = input_tensor->mutable_data<int>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(int));
} else {
LOG(FATAL) << "MACE do not support the input data type: " << input_dt;
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus MaceEngine::Impl::TransposeOutput(
const mace::Tensor *output_tensor,
std::pair<const std::string, mace::MaceTensor> *output) {
DataType output_dt = output_tensor->dtype();
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (output_tensor->data_format() != DataFormat::DF_NONE &&
......@@ -655,11 +686,23 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
<< output->second.impl_->buffer_size;
output->second.impl_->shape = shape;
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
if (output_dt == DataType::DT_FLOAT) {
auto output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data<float>().get());
} else if (output_dt == DataType::DT_INT32) {
auto output_data = output_tensor->data<int>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data<int>().get(),
output_dt);
} else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
return MaceStatus::MACE_INVALID_ARGS;
}
} else {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
......@@ -670,8 +713,17 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
<< MakeString<int64_t>(shape) << " vs buffer size "
<< output->second.impl_->buffer_size;
output->second.impl_->shape = shape;
std::memcpy(output->second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
if (output_dt == DataType::DT_FLOAT) {
std::memcpy(output->second.data<float>().get(),
output_tensor->data<float>(),
output_size * sizeof(float));
} else if (output_dt == DataType::DT_INT32) {
std::memcpy(output->second.data<int>().get(),
output_tensor->data<int>(),
output_size * sizeof(int));
} else {
LOG(FATAL) << "MACE do not support the output data type: " << output_dt;
}
return MaceStatus::MACE_SUCCESS;
}
} else {
......
......@@ -14,19 +14,14 @@
#include "mace/ops/common/transpose.h"
#include <algorithm>
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/utils/logging.h"
namespace mace {
namespace ops {
namespace {
namespace transpose {
void TransposeNHWCToNCHWC3(const float *input,
float *output,
const index_t height,
......@@ -100,119 +95,44 @@ void TransposeNCHWToNHWCC2(const float *input,
#endif
}
}
} // namespace
MaceStatus Transpose(const float *input,
const std::vector<int64_t> &input_shape,
const std::vector<int> &dst_dims,
float *output) {
MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
(input_shape.size() == 4 && dst_dims.size() == 4),
"Only support 2D or 4D transpose");
void TransposeNHWCToNCHWC3(const int *input,
int *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
std::vector<index_t> output_shape;
for (size_t i = 0; i < dst_dims.size(); ++i) {
output_shape.push_back(input_shape[dst_dims[i]]);
}
#pragma omp parallel for
for (index_t h = 0; h < height; ++h) {
index_t in_offset = h * width * 3;
index_t out_offset = h * width;
if (input_shape.size() == 2) {
MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t stride_i = height;
index_t stride_j = width;
index_t tile_size = height > 512 || width > 512 ? 64 : 32;
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < height; i += tile_size) {
for (index_t j = 0; j < width; j += tile_size) {
index_t end_i = std::min(i + tile_size, height);
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
output[tile_j * stride_i + tile_i] =
input[tile_i * stride_j + tile_j];
}
}
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < 3; ++c) {
output[out_offset + c * image_size + w] = input[in_offset + w * 3 + c];
}
}
} else if (input_shape.size() == 4) {
std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) {
for (index_t b = 0; b < input_shape[0]; ++b) {
TransposeNHWCToNCHWC3(input + b * batch_size,
output + b * batch_size,
input_shape[1],
input_shape[2]);
}
} else if (dst_dims == transpose_order_from_NCHW_to_NHWC
&& input_shape[1] == 2) {
for (index_t b = 0; b < input_shape[0]; ++b) {
TransposeNCHWToNHWCC2(input + b * batch_size,
output + b * batch_size,
input_shape[2],
input_shape[3]);
}
} else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channel = input_shape[3];
index_t channel_raw_size = channel * sizeof(float);
index_t stride_i = height;
index_t stride_j = width;
index_t tile_size = std::max(static_cast<index_t>(1),
static_cast<index_t>(std::sqrt(
8 * 1024 / channel)));
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < height; i += tile_size) {
for (index_t j = 0; j < width; j += tile_size) {
index_t end_i = std::min(i + tile_size, height);
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
memcpy(output + (tile_j * stride_i + tile_i) * channel,
input + (tile_i * stride_j + tile_j) * channel,
channel_raw_size);
}
}
}
}
} else {
std::vector<index_t>
in_stride{input_shape[1] * input_shape[2] * input_shape[3],
input_shape[2] * input_shape[3], input_shape[3], 1};
std::vector<index_t>
out_stride{output_shape[1] * output_shape[2] * output_shape[3],
output_shape[2] * output_shape[3], output_shape[3], 1};
}
}
std::vector<index_t> idim(4, 0);
std::vector<index_t> odim(4, 0);
for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
idim[dst_dims[0]] = odim[0];
idim[dst_dims[1]] = odim[1];
idim[dst_dims[2]] = odim[2];
idim[dst_dims[3]] = odim[3];
void TransposeNCHWToNHWCC2(const int *input,
int *output,
const index_t height,
const index_t width) {
index_t image_size = height * width;
#pragma omp parallel for
for (index_t h = 0; h < height; ++h) {
index_t in_offset = h * width;
index_t out_offset = h * width * 2;
output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
+ odim[2] * out_stride[2] + odim[3]] =
input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+ idim[2] * in_stride[2] + idim[3]];
}
}
}
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < 2; ++c) {
output[out_offset + w * 2 + c] = input[in_offset + c * image_size + w];
}
}
} else {
MACE_NOT_IMPLEMENTED;
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace transpose
} // namespace ops
} // namespace mace
......@@ -15,17 +15,154 @@
#ifndef MACE_OPS_COMMON_TRANSPOSE_H_
#define MACE_OPS_COMMON_TRANSPOSE_H_
#include <algorithm>
#include <vector>
#include "mace/public/mace.h"
#include "mace/core/tensor.h"
namespace mace {
namespace ops {
namespace transpose {
MaceStatus Transpose(const float *input,
void TransposeNHWCToNCHWC3(const float *input,
float *output,
const index_t height,
const index_t width);
void TransposeNHWCToNCHWC3(const int *input,
int *output,
const index_t height,
const index_t width);
void TransposeNCHWToNHWCC2(const float *input,
float *output,
const index_t height,
const index_t width);
void TransposeNCHWToNHWCC2(const int *input,
int *output,
const index_t height,
const index_t width);
} // namespace transpose
// Permutes the axes of a dense 2-D or 4-D array.
//
// Output axis i takes its extent from input axis dst_dims[i]; elements are
// copied from `input` to `output` accordingly.
//
// Parameters:
//   input       - source elements, read with row-major strides derived from
//                 input_shape.
//   input_shape - extents of the source; must have 2 or 4 entries.
//   dst_dims    - axis permutation; must have as many entries as input_shape.
//                 For 2-D input it must be {1, 0}.
//   output      - destination buffer; written, never read.
//   data_type   - only consulted to enable the two specialized 4-D paths
//                 (DT_FLOAT or DT_INT32); every other path ignores it.
//
// Returns MaceStatus::MACE_SUCCESS when the function reaches its end.
//
// NOTE(review): this listing is a diff view. The `float *output);` row in the
// parameter list looks like the removed tail of the old declaration, not part
// of the new signature — confirm against the real header.
template <typename T>
MaceStatus Transpose(const T *input,
const std::vector<int64_t> &input_shape,
const std::vector<int> &dst_dims,
float *output);
T *output,
DataType data_type = DataType::DT_FLOAT) {
MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) ||
(input_shape.size() == 4 && dst_dims.size() == 4),
"Only support 2D or 4D transpose");
// Shape after the permutation: output axis i has the extent of input axis
// dst_dims[i].
std::vector<index_t> output_shape;
for (size_t i = 0; i < dst_dims.size(); ++i) {
output_shape.push_back(input_shape[dst_dims[i]]);
}
if (input_shape.size() == 2) {
// 2-D case: only the real swap {1, 0} is accepted.
MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform");
index_t height = input_shape[0];
index_t width = input_shape[1];
// stride_i is the row stride of the output, stride_j the row stride of the
// input.
index_t stride_i = height;
index_t stride_j = width;
// The matrix is walked in square tiles: 64 when either side exceeds 512,
// otherwise 32. Tiles are distributed over the two outer loops by OpenMP.
index_t tile_size = height > 512 || width > 512 ? 64 : 32;
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < height; i += tile_size) {
for (index_t j = 0; j < width; j += tile_size) {
// Clamp the tile to the matrix edge.
index_t end_i = std::min(i + tile_size, height);
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
output[tile_j * stride_i + tile_i] =
input[tile_i * stride_j + tile_j];
}
}
}
}
} else if (input_shape.size() == 4) {
std::vector<int> transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2};
std::vector<int> transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1};
// Number of elements in one slice along axis 0.
index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3];
// The specialized helpers are only declared for float and int, so they are
// used only for these two data types.
bool supported_dt = (data_type == DataType::DT_FLOAT ||
data_type == DataType::DT_INT32);
if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3 &&
supported_dt) {
// Specialized path: {0, 3, 1, 2} with a last axis of extent 3, handled one
// axis-0 slice at a time.
for (index_t b = 0; b < input_shape[0]; ++b) {
transpose::TransposeNHWCToNCHWC3(input + b * batch_size,
output + b * batch_size,
input_shape[1],
input_shape[2]);
}
} else if (dst_dims == transpose_order_from_NCHW_to_NHWC
&& input_shape[1] == 2 && supported_dt) {
// Specialized path: {0, 2, 3, 1} with axis 1 of extent 2, handled one
// axis-0 slice at a time.
for (index_t b = 0; b < input_shape[0]; ++b) {
transpose::TransposeNCHWToNHWCC2(input + b * batch_size,
output + b * batch_size,
input_shape[2],
input_shape[3]);
}
} else if (dst_dims == std::vector<int>{0, 2, 1, 3}) {
// Swap of axes 1 and 2: each run of `channel` contiguous elements is moved
// with a single memcpy.
// NOTE(review): input_shape[0] is never used in this branch, so only the
// first axis-0 slice appears to be transposed — confirm whether callers
// always pass a leading extent of 1.
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channel = input_shape[3];
index_t channel_raw_size = channel * sizeof(T);
index_t stride_i = height;
index_t stride_j = width;
// Tile edge is sqrt(8 * 1024 / channel), with a floor of 1.
// NOTE(review): channel == 0 would make this an integer division by zero —
// confirm callers never pass an empty last axis.
// NOTE(review): std::sqrt and memcpy need <cmath> and <cstring>; neither is
// among the includes visible in this listing — confirm they arrive
// transitively.
index_t tile_size = std::max(static_cast<index_t>(1),
static_cast<index_t>(std::sqrt(
8 * 1024 / channel)));
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < height; i += tile_size) {
for (index_t j = 0; j < width; j += tile_size) {
index_t end_i = std::min(i + tile_size, height);
index_t end_j = std::min(j + tile_size, width);
for (index_t tile_i = i; tile_i < end_i; ++tile_i) {
for (index_t tile_j = j; tile_j < end_j; ++tile_j) {
memcpy(output + (tile_j * stride_i + tile_i) * channel,
input + (tile_i * stride_j + tile_j) * channel,
channel_raw_size);
}
}
}
}
} else {
// Generic 4-D path for any other permutation: plain nested loops over the
// output index, with no OpenMP pragma.
std::vector<index_t>
in_stride{input_shape[1] * input_shape[2] * input_shape[3],
input_shape[2] * input_shape[3], input_shape[3], 1};
std::vector<index_t>
out_stride{output_shape[1] * output_shape[2] * output_shape[3],
output_shape[2] * output_shape[3], output_shape[3], 1};
std::vector<index_t> idim(4, 0);
std::vector<index_t> odim(4, 0);
for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
// Map the output index back to the input index through dst_dims.
idim[dst_dims[0]] = odim[0];
idim[dst_dims[1]] = odim[1];
idim[dst_dims[2]] = odim[2];
idim[dst_dims[3]] = odim[3];
output[odim[0] * out_stride[0] + odim[1] * out_stride[1]
+ odim[2] * out_stride[2] + odim[3]] =
input[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+ idim[2] * in_stride[2] + idim[3]];
}
}
}
}
}
} else {
// Ranks other than 2 and 4 are not handled; the check at the top of the
// function already rejects them.
MACE_NOT_IMPLEMENTED;
}
return MaceStatus::MACE_SUCCESS;
}
} // namespace ops
} // namespace mace
......
......@@ -14,6 +14,8 @@
#include "mace/core/operator.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/math.h"
namespace mace {
namespace ops {
......@@ -33,21 +35,35 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
index_t input_dims_size = input->dim_size();
if ( axis_ < 0 ) {
if (axis_ < 0) {
axis_ += input_dims_size + 1;
}
MACE_CHECK(axis_ >= 0 && axis_ <= input_dims_size,
"axis is out of bound: ", axis_);
const std::vector<index_t> input_shape = input->shape();
std::vector<index_t> output_shape;
output_shape.insert(output_shape.end(), input_shape.begin(),
input_shape.begin() + axis_);
output_shape.insert(output_shape.end(), 1);
output_shape.insert(output_shape.end(), input_shape.begin() + axis_,
input_shape.end());
std::vector<index_t> output_shape(input_shape);
output_shape.insert(output_shape.begin() + axis_, 1);
output->ReuseTensorBuffer(*input);
output->Reshape(output_shape);
bool has_data_format = Operation::GetOptionalArg<int>(
"has_data_format", 0) == 1;
if (has_data_format && output_shape.size() == 4) {
// only tensorflow support expand dim, so the default format is NHWC
// transform NHWC to NCHW
auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
{0, 3, 1, 2});
output->Resize(t_output_shape);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
Transpose(input_data, output_shape, {0, 3, 1, 2}, output_data);
} else {
output->Resize(output_shape);
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<T>();
output->Copy<T>(input_data, input->size());
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -62,11 +78,6 @@ void RegisterExpandDims(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
}
} // namespace ops
......
......@@ -326,7 +326,7 @@ class MACE_API MaceTensor {
// of shared_ptr and manage the life cycle of the buffer by yourself.
// For example, std::shared_ptr<float>(raw_buffer, [](float *){});
MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data,
std::shared_ptr<void> data,
const DataFormat format = DataFormat::NHWC);
MaceTensor();
MaceTensor(const MaceTensor &other);
......@@ -339,8 +339,20 @@ class MACE_API MaceTensor {
const std::vector<int64_t> &shape() const;
const std::shared_ptr<float> data() const;
std::shared_ptr<float> data();
// Typed read-only accessor: returns the handle from raw_data() cast to
// std::shared_ptr<T>. The cast is a static_pointer_cast, so the caller is
// responsible for choosing a T that matches the buffer's element type.
template <typename T>
const std::shared_ptr<T> data() const {
return std::static_pointer_cast<T>(raw_data());
}
// Typed non-const accessor: returns the handle from raw_mutable_data() cast
// to std::shared_ptr<T>. The cast is a static_pointer_cast, so the caller is
// responsible for choosing a T that matches the buffer's element type.
template <typename T>
std::shared_ptr<T> data() {
return std::static_pointer_cast<T>(raw_mutable_data());
}
DataFormat data_format() const;
private:
std::shared_ptr<void> raw_data() const;
std::shared_ptr<void> raw_mutable_data();
private:
class Impl;
std::unique_ptr<Impl> impl_;
......
......@@ -47,6 +47,11 @@ data_format_map = {
'OIHW': cvt.DataFormat.OIHW,
}
# Maps the data type names accepted by --input_data_types and
# --output_data_types to the corresponding mace_pb2 data type constants.
# A name that is not listed here raises KeyError at lookup.
data_type_map = {
'float32': mace_pb2.DT_FLOAT,
'int32': mace_pb2.DT_INT32,
}
def parse_data_type(data_type, device_type):
if device_type == cvt.DeviceType.CPU.value or \
......@@ -141,6 +146,7 @@ def main(unused_args):
option.data_type = parse_data_type(FLAGS.data_type, option.device)
input_node_names = FLAGS.input_node.split(',')
input_data_types = FLAGS.input_data_types.split(',')
input_node_shapes = FLAGS.input_shape.split(':')
input_node_formats = FLAGS.input_data_formats.split(",")
if FLAGS.input_range:
......@@ -152,10 +158,8 @@ def main(unused_args):
for i in six.moves.range(len(input_node_names)):
input_node = cvt.NodeInfo()
input_node.name = input_node_names[i]
if len(input_node_formats) == 1:
input_node.data_format = data_format_map[input_node_formats[0]]
else:
input_node.data_format = data_format_map[input_node_formats[i]]
input_node.data_type = data_type_map[input_data_types[i]]
input_node.data_format = data_format_map[input_node_formats[i]]
input_node.shape = parse_int_array_from_str(input_node_shapes[i])
if input_node.data_format == cvt.DataFormat.NCHW and\
len(input_node.shape) == 4:
......@@ -166,6 +170,7 @@ def main(unused_args):
option.add_input_node(input_node)
output_node_names = FLAGS.output_node.split(',')
output_data_types = FLAGS.output_data_types.split(',')
output_node_shapes = FLAGS.output_shape.split(':')
output_node_formats = FLAGS.output_data_formats.split(",")
if len(output_node_names) != len(output_node_shapes):
......@@ -173,10 +178,8 @@ def main(unused_args):
for i in six.moves.range(len(output_node_names)):
output_node = cvt.NodeInfo()
output_node.name = output_node_names[i]
if len(output_node_formats) == 1:
output_node.data_format = data_format_map[output_node_formats[0]]
else:
output_node.data_format = data_format_map[output_node_formats[i]]
output_node.data_type = data_type_map[output_data_types[i]]
output_node.data_format = data_format_map[output_node_formats[i]]
output_node.shape = parse_int_array_from_str(output_node_shapes[i])
if output_node.data_format == cvt.DataFormat.NCHW and\
len(output_node.shape) == 4:
......@@ -290,6 +293,11 @@ def parse_args():
type=str,
default="input_node",
help="e.g., input_node")
parser.add_argument(
"--input_data_types",
type=str,
default="float32",
help="e.g., float32|int32")
parser.add_argument(
"--input_data_formats",
type=str,
......@@ -297,6 +305,11 @@ def parse_args():
help="e.g., NHWC,NONE")
parser.add_argument(
"--output_node", type=str, default="softmax", help="e.g., softmax")
parser.add_argument(
"--output_data_types",
type=str,
default="float32",
help="e.g., float32|int32")
parser.add_argument(
"--output_data_formats",
type=str,
......
......@@ -298,6 +298,7 @@ class NodeInfo(object):
def __init__(self):
self._name = None
self._data_type = mace_pb2.DT_FLOAT
self._shape = []
self._data_format = DataFormat.NHWC
self._range = [-1.0, 1.0]
......@@ -306,6 +307,10 @@ class NodeInfo(object):
def name(self):
return self._name
# Getter: returns the data type stored in self._data_type.
@property
def data_type(self):
return self._data_type
@property
def shape(self):
return self._shape
......@@ -322,6 +327,10 @@ class NodeInfo(object):
def name(self, name):
self._name = name
# Setter: stores the given data type in self._data_type without validation.
@data_type.setter
def data_type(self, data_type):
self._data_type = data_type
@shape.setter
def shape(self, shape):
self._shape = shape
......
......@@ -102,6 +102,7 @@ TFSupportedOps = [
'Mean',
'Const',
'Gather',
'GatherV2',
'StridedSlice',
'Slice',
'ReverseV2',
......@@ -241,6 +242,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
TFOpType.Mean.name: self.convert_mean,
TFOpType.Const.name: self.convert_nop,
TFOpType.Gather.name: self.convert_gather,
TFOpType.GatherV2.name: self.convert_gather,
TFOpType.StridedSlice.name: self.convert_stridedslice,
TFOpType.Slice.name: self.convert_slice,
TFOpType.ReverseV2.name: self.convert_reverse,
......@@ -838,16 +840,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
op = self.convert_general_op(tf_op)
op.type = MaceOp.ExpandDims.name
axis_value = tf_op.inputs[1].eval().astype(np.int32)
axis_arg = op.arg.add()
axis_arg.name = MaceKeyword.mace_axis_str
try:
axis_value = tf_op.get_attr('dim')
except ValueError:
try:
axis_value = tf_op.get_attr('axis')
except ValueError:
axis_value = 0
axis_arg.i = axis_value
del op.input[1]
def convert_squeeze(self, tf_op):
op = self.convert_general_op(tf_op)
......
......@@ -323,7 +323,7 @@ class Transformer(base_converter.ConverterInterface):
input_info.name = input_node.name
input_info.data_format = input_node.data_format.value
input_info.dims.extend(input_node.shape)
input_info.data_type = mace_pb2.DT_FLOAT
input_info.data_type = input_node.data_type
output_nodes = self._option.check_nodes.values()
for output_node in output_nodes:
......@@ -332,7 +332,7 @@ class Transformer(base_converter.ConverterInterface):
output_info.data_format = output_node.data_format.value
output_info.dims.extend(
self._producer[output_node.name].output_shape[0].dims)
output_info.data_type = mace_pb2.DT_FLOAT
output_info.data_type = output_node.data_type
return False
......
......@@ -317,17 +317,18 @@ bool RunModel(const std::string &model_name,
std::map<std::string, mace::MaceTensor> outputs;
for (size_t i = 0; i < input_count; ++i) {
// Allocate input and output
// only support float and int32, use char for generalization
// sizeof(int) == 4, sizeof(float) == 4
int64_t input_size =
std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 4,
std::multiplies<int64_t>());
auto buffer_in = std::shared_ptr<float>(new float[input_size],
std::default_delete<float[]>());
auto buffer_in = std::shared_ptr<char>(new char[input_size],
std::default_delete<char[]>());
// load input
std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
std::ios::in | std::ios::binary);
if (in_file.is_open()) {
in_file.read(reinterpret_cast<char *>(buffer_in.get()),
input_size * sizeof(float));
in_file.read(buffer_in.get(), input_size);
in_file.close();
} else {
LOG(INFO) << "Open input file failed";
......@@ -338,11 +339,12 @@ bool RunModel(const std::string &model_name,
}
for (size_t i = 0; i < output_count; ++i) {
// only support float and int32, use char for generalization
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4,
std::multiplies<int64_t>());
auto buffer_out = std::shared_ptr<float>(new float[output_size],
std::default_delete<float[]>());
auto buffer_out = std::shared_ptr<char>(new char[output_size],
std::default_delete<char[]>());
outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
output_data_formats[i]);
}
......@@ -454,12 +456,12 @@ bool RunModel(const std::string &model_name,
std::string output_name =
FLAGS_output_file + "_" + FormatName(output_names[i]);
std::ofstream out_file(output_name, std::ios::binary);
// only support float and int32
int64_t output_size =
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 4,
std::multiplies<int64_t>());
out_file.write(
reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
output_size * sizeof(float));
outputs[output_names[i]].data<char>().get(), output_size);
out_file.flush();
out_file.close();
LOG(INFO) << "Write output file " << output_name << " with size "
......@@ -524,6 +526,7 @@ int Main(int argc, char **argv) {
// get cpu capability
Capability cpu_capability = GetCapability(DeviceType::CPU);
float cpu_float32_performance = cpu_capability.float32_performance.exec_time;
bool ret = false;
for (int i = 0; i < FLAGS_restart_round; ++i) {
......@@ -531,7 +534,7 @@ int Main(int argc, char **argv) {
ret = RunModel(FLAGS_model_name,
input_names, input_shape_vec, input_data_formats,
output_names, output_shape_vec, output_data_formats,
cpu_capability.float32_performance.exec_time);
cpu_float32_performance);
}
if (ret) {
return 0;
......
......@@ -397,6 +397,7 @@ class YAMLKeyword(object):
runtime = 'runtime'
data_type = 'data_type'
input_data_types = 'input_data_types'
output_data_types = 'output_data_types'
input_data_formats = 'input_data_formats'
output_data_formats = 'output_data_formats'
limit_opencl_kernel_time = 'limit_opencl_kernel_time'
......
......@@ -65,13 +65,13 @@ RuntimeTypeStrs = [
"cpu+gpu"
]
InputDataTypeStrs = [
InOutDataTypeStrs = [
"int32",
"float32",
]
InputDataType = Enum('InputDataType',
[(ele, ele) for ele in InputDataTypeStrs],
InOutDataType = Enum('InputDataType',
[(ele, ele) for ele in InOutDataTypeStrs],
type=str)
FPDataTypeStrs = [
......@@ -410,17 +410,23 @@ def format_model_config(flags):
else:
subgraph[key] = []
input_data_types = subgraph.get(YAMLKeyword.input_data_types, "")
if input_data_types:
if not isinstance(input_data_types, list):
subgraph[YAMLKeyword.input_data_types] = [input_data_types]
for input_data_type in subgraph[YAMLKeyword.input_data_types]:
mace_check(input_data_type in InputDataTypeStrs,
ModuleName.YAML_CONFIG,
"'input_data_types' must be in "
+ str(InputDataTypeStrs))
else:
subgraph[YAMLKeyword.input_data_types] = []
for key in [YAMLKeyword.input_data_types,
YAMLKeyword.output_data_types]:
if key == YAMLKeyword.input_data_types:
count = input_size
else:
count = output_size
data_types = subgraph.get(key, "")
if data_types:
if not isinstance(data_types, list):
subgraph[key] = [data_types] * count
for data_type in subgraph[key]:
mace_check(data_type in InOutDataTypeStrs,
ModuleName.YAML_CONFIG,
key + " must be in "
+ str(InOutDataTypeStrs))
else:
subgraph[key] = [InOutDataType.float32] * count
input_data_formats = subgraph.get(YAMLKeyword.input_data_formats,
[])
......@@ -722,8 +728,10 @@ def convert_model(configs, cl_mem_type):
model_config[YAMLKeyword.model_sha256_checksum],
model_config[YAMLKeyword.weight_sha256_checksum],
",".join(subgraphs[0][YAMLKeyword.input_tensors]),
",".join(subgraphs[0][YAMLKeyword.input_data_types]),
",".join(subgraphs[0][YAMLKeyword.input_data_formats]),
",".join(subgraphs[0][YAMLKeyword.output_tensors]),
",".join(subgraphs[0][YAMLKeyword.output_data_types]),
",".join(subgraphs[0][YAMLKeyword.output_data_formats]),
",".join(subgraphs[0][YAMLKeyword.check_tensors]),
runtime,
......
......@@ -480,8 +480,10 @@ def gen_model_code(model_codegen_dir,
model_sha256_checksum,
weight_sha256_checksum,
input_nodes,
input_data_types,
input_data_formats,
output_nodes,
output_data_types,
output_data_formats,
check_nodes,
runtime,
......@@ -515,8 +517,10 @@ def gen_model_code(model_codegen_dir,
"--model_checksum=%s" % model_sha256_checksum,
"--weight_checksum=%s" % weight_sha256_checksum,
"--input_node=%s" % input_nodes,
"--input_data_types=%s" % input_data_types,
"--input_data_formats=%s" % input_data_formats,
"--output_node=%s" % output_nodes,
"--output_data_types=%s" % output_data_types,
"--output_data_formats=%s" % output_data_formats,
"--check_node=%s" % check_nodes,
"--runtime=%s" % runtime,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册