Commit 74dcd617 authored by: liuqi

Refactor: Support auto transformation and net optimization.

1. Automatic transformation between data formats, data types and memory types.
2. Add a device placement optimization strategy.
Parent: bde945cd
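For orientation: after this commit, MaceEngine::Impl::Init first runs the new NetDefAdapter pass over the model and then builds the net from the adapted definition. The sketch below is distilled from the MaceEngine::Impl::Init hunk near the end of this diff; error handling and the surrounding code are omitted.

// The raw NetDef is adapted first (device placement, data type/format and
// memory type), then the adapted def drives net construction and
// output-tensor preallocation.
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), ws_.get());
net_def_adapter.AdaptNetDef(net_def, device_.get(), &adapted_net_def);

MemoryOptimizer mem_optimizer;
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
                                              &adapted_net_def,
                                              ws_.get(),
                                              device_.get(),
                                              &mem_optimizer));
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(adapted_net_def,
                                                  &mem_optimizer,
                                                  device_.get()));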
...@@ -96,6 +96,44 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) ...@@ -96,6 +96,44 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC #undef MACE_GET_REPEATED_ARGUMENT_FUNC
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname) \
template<> \
void SetProtoArg<T>(Def *def, \
const std::string &arg_name, \
const T &value) { \
int size = def->arg_size(); \
for (int i = 0; i < size; ++i) { \
auto arg = def->mutable_arg(i); \
if (arg->name() == arg_name) { \
VLOG(3) << "Update old argument value from " \
<< arg->fieldname() << " to " \
<< value << " for " << arg_name; \
arg->set_##fieldname(value); \
return; \
} \
} \
VLOG(3) << "Add new argument " << arg_name << "(name: " \
<< arg_name << ", value: " << value << ")"; \
auto arg = def->add_arg(); \
arg->set_name(arg_name); \
arg->set_##fieldname(value); \
}
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(Def) \
MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f) \
MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i) \
MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i) \
MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i) \
MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, std::string, s)
MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef)
MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef)
#undef MACE_SET_OPTIONAL_ARGUMENT_FUNC
std::string OutputMemoryTypeTagName() {
static const char *kOutputMemTypeArgName = "output_mem_type";
return kOutputMemTypeArgName;
}
bool IsQuantizedModel(const NetDef &net_def) { bool IsQuantizedModel(const NetDef &net_def) {
return return
......
...@@ -55,6 +55,18 @@ class ProtoArgHelper { ...@@ -55,6 +55,18 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_; std::map<std::string, Argument> arg_map_;
}; };
template <typename T>
void SetProtoArg(OperatorDef *op_def,
const std::string &arg_name,
const T&value);
template <typename T>
void SetProtoArg(NetDef *op_def,
const std::string &arg_name,
const T&value);
std::string OutputMemoryTypeTagName();
bool IsQuantizedModel(const NetDef &def); bool IsQuantizedModel(const NetDef &def);
} // namespace mace } // namespace mace
......
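A minimal usage sketch for the new SetProtoArg helper together with the output_mem_type tag; the op name and the chosen memory type below are made up for illustration.

// SetProtoArg updates an existing argument in place or appends a new one.
OperatorDef op_def;
op_def.set_name("example_op");  // hypothetical op
SetProtoArg<int>(&op_def, OutputMemoryTypeTagName(),
                 static_cast<int>(MemoryType::GPU_IMAGE));

// The tag is later read back the usual way, e.g. in SerialNet:
MemoryType out_mem_type = static_cast<MemoryType>(
    ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
        op_def, OutputMemoryTypeTagName(),
        static_cast<int>(MemoryType::CPU_BUFFER)));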
...@@ -33,7 +33,7 @@ namespace mace { ...@@ -33,7 +33,7 @@ namespace mace {
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) { bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
static const std::unordered_set<std::string> kReuseOp = { static const std::unordered_set<std::string> kReuseOp = {
"Reshape", "Identity", "Squeeze" "Reshape", "Identity", "Squeeze", "ExpandDims"
}; };
return kReuseOp.count(op_type) == 1; return kReuseOp.count(op_type) == 1;
} }
...@@ -124,8 +124,9 @@ void MemoryOptimizer::Optimize( ...@@ -124,8 +124,9 @@ void MemoryOptimizer::Optimize(
op_def->output_type_size()); op_def->output_type_size());
DataType dt; DataType dt;
bool has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( DataFormat data_format = static_cast<DataFormat>(
*op_def, "has_data_format", 0) != 0; ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "data_format", DataFormat::DF_NONE));
int output_size = op_def->output_size(); int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) { for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) { if (i < op_def->output_type_size()) {
...@@ -209,7 +210,7 @@ void MemoryOptimizer::Optimize( ...@@ -209,7 +210,7 @@ void MemoryOptimizer::Optimize(
mem_ref_count_[best_mem_id] = 1; mem_ref_count_[best_mem_id] = 1;
} }
tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id, tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id,
dt, has_data_format)); dt, data_format));
} }
} }
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <vector> #include <vector>
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/types.h" #include "mace/core/types.h"
namespace mace { namespace mace {
...@@ -81,10 +82,10 @@ class MemoryOptimizer { ...@@ -81,10 +82,10 @@ class MemoryOptimizer {
struct TensorMemInfo { struct TensorMemInfo {
int mem_id; int mem_id;
DataType data_type; DataType data_type;
bool has_data_format; DataFormat data_format;
TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) : TensorMemInfo(int mem_id, DataType data_type, DataFormat data_format) :
mem_id(mem_id), data_type(data_type), has_data_format(has_data_format) mem_id(mem_id), data_type(data_type), data_format(data_format)
{} {}
}; };
......
...@@ -31,99 +31,8 @@ ...@@ -31,99 +31,8 @@
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace {
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const DataFormat data_format,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), data_format(data_format),
shape(shape), op_idx(op_idx) {}
MemoryType mem_type; // transformed memory type
DataType dtype;
DataFormat data_format;
std::vector<index_t> shape; // tensor shape
int op_idx; // operation which generate the tensor
};
#ifdef MACE_ENABLE_OPENCL
std::string TransformedName(const std::string &input_name,
const mace::MemoryType mem_type) {
std::stringstream ss;
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
bool TransformRequiredOp(const std::string &op_type) {
static const std::unordered_set<std::string> kNoTransformOp = {
"Shape", "InferConv2dShape"
};
return kNoTransformOp.count(op_type) == 0;
}
#endif // MACE_ENABLE_OPENCL
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
bool has_data_format,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_.get());
construct_context->set_operator_def(op_def);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
// Get available devices
auto available_devices =
op_registry->AvailableDevices(op_def->type(), construct_context);
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to CPU device.
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context->set_device(target_device_);
if (target_device_->device_type() == DeviceType::GPU) {
construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
}
break;
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
{0, 3, 1, 2});
for (int i = 0; i < 4; ++i) {
op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
}
}
}
}
return op_registry->CreateOperation(construct_context, device_type);
}
SerialNet::SerialNet(const OpRegistryBase *op_registry, SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def, const NetDef *net_def,
Workspace *ws, Workspace *ws,
...@@ -138,237 +47,47 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, ...@@ -138,237 +47,47 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(), target_device->cpu_runtime()->policy(),
&target_device->cpu_runtime()->thread_pool())) { &target_device->cpu_runtime()->thread_pool())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
// Tensor Shape map
std::unordered_map<std::string, std::vector<index_t>> tensor_shape_map;
for (auto &op : net_def->op()) {
if (op.output_size() != op.output_shape_size()) {
continue;
}
for (int i = 0; i < op.output_size(); ++i) {
tensor_shape_map[op.output(i)] = std::vector<index_t>(
op.output_shape(i).dims().begin(),
op.output_shape(i).dims().end());
}
}
for (auto &tensor : net_def->tensors()) {
tensor_shape_map[tensor.name()] =
std::vector<index_t>(tensor.dims().begin(), tensor.dims().end());
}
bool has_data_format = false;
if (target_device_->device_type() == DeviceType::CPU) {
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
// Only could be NONE or NHWC
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = has_data_format ||
(input_data_format != DataFormat::DF_NONE);
if (!is_quantize_model && input_data_format == DataFormat::NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
}
}
}
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization // used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map; std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_set<std::string> transformed_set;
// add input information
MemoryType target_mem_type;
// default data format of output tensor
DataFormat default_output_df = DataFormat::DF_NONE;
if (target_device_->device_type() == DeviceType::GPU) {
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = input_data_format != DataFormat::DF_NONE;
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_data_format,
input_shape, -1));
}
default_output_df =
has_data_format ? DataFormat::NHWC : DataFormat::DF_NONE;
}
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_, &tensor_shape_map); OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx))); std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation // Create operation
auto op = CreateOperation(op_registry, auto op_device_type = static_cast<DeviceType>(op_def->device_type());
&construct_context, if (op_device_type == target_device_->device_type()) {
op_def, construct_context.set_device(target_device_);
has_data_format, } else if (op_device_type == DeviceType::CPU) {
is_quantize_model); construct_context.set_device(cpu_device_.get());
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
// the outputs' memory type of the operation
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
// if op is memory-unused op, no transformation
if (TransformRequiredOp(op_def->type())) {
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether to do transform
MemoryType wanted_in_mem_type =
construct_context.GetInputMemType(i);
DataType wanted_in_dt = construct_context.GetInputDataType(i);
if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
|| output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
auto t_input_name = TransformedName(op_def->input(i),
wanted_in_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation " << op_def->name()
<< " to transform tensor "
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< wanted_in_dt << ". with data format "
<< output_info.data_format;
std::string input_name = op_def->input(i);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
output_info.data_format == DataFormat::NCHW &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name, wanted_in_dt,
construct_context.GetInputOpenCLBufferType(i),
wanted_in_mem_type, has_data_format);
OpConstructContext t_construct_context(ws_);
auto transform_op = CreateOperation(
op_registry,
&t_construct_context,
transform_op_def,
has_data_format);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else { } else {
op_def->set_input(i, t_input_name); LOG(FATAL) << "Encounter unexpected error: "
} << op_device_type << " vs " << target_device_->device_type();
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
}
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
DataType dt;
if (op_def->output_type_size() == op_def->output_size()) {
dt = op_def->output_type(out_idx);
} else {
dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
} }
output_mem_map[op_def->output(out_idx)] = out_mem_type; construct_context.set_operator_def(op_def);
output_map.emplace(
op_def->output(out_idx), auto op = op_registry->CreateOperation(&construct_context,
InternalOutputInfo( op_device_type);
out_mem_type,
dt,
default_output_df,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
}
#endif // MACE_ENABLE_OPENCL
operators_.emplace_back(std::move(op)); operators_.emplace_back(std::move(op));
// where to do graph reference count. // where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get()); mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) { if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) { // update the map : output_tensor -> Operation
auto &internal_output_info = output_map.at(output_info.name()); MemoryType out_mem_type =
if ((internal_output_info.mem_type != target_mem_type && static_cast<MemoryType>(
internal_output_info.mem_type != MemoryType::CPU_BUFFER) || ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
internal_output_info.dtype != output_info.data_type()) { net_def->op(idx), OutputMemoryTypeTagName(),
VLOG(1) << "Add Transform operation to transform output tensor '" static_cast<int>(MemoryType::CPU_BUFFER)));
<< output_info.name() << "', from memory type " for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
<< internal_output_info.mem_type output_mem_map[op_def->output(out_idx)] = out_mem_type;
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << output_info.data_type();
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
bool output_has_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
output_info.data_type(),
OpenCLBufferType::IN_OUT_CHANNEL,
target_mem_type,
output_has_data_format);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_has_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
}
} }
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
}
// Update output tensor reference // Update output tensor reference
for (auto &output_info : net_def->output_info()) { for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name()); mem_optimizer->UpdateTensorRef(output_info.name());
......
...@@ -54,14 +54,6 @@ class SerialNet : public NetBase { ...@@ -54,14 +54,6 @@ class SerialNet : public NetBase {
MaceStatus Run(RunMetadata *run_metadata = nullptr) override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
bool has_data_format,
bool is_quantize_model = false);
protected: protected:
Workspace *ws_; Workspace *ws_;
Device *target_device_; Device *target_device_;
......
This diff is collapsed.
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_DEF_ADAPTER_H_
#define MACE_CORE_NET_DEF_ADAPTER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/operator.h"
#include "mace/core/net_optimizer.h"
namespace mace {
class OpRegistryBase;
class Workspace;
class Device;
/**
* Conventions:
* 1. DataFormat::DT_AUTO stands for formatted (NHWC or NCHW)
 * 2. if an Op has DataFormat::DT_AUTO, the arguments of this op
 *    are formatted as NHWC
*/
class NetDefAdapter {
public:
NetDefAdapter(const OpRegistryBase *op_registry,
const Workspace *ws);
MaceStatus AdaptNetDef(
const NetDef *net_def,
Device *target_device,
NetDef *target_net_def);
public:
NetDefAdapter(const NetDefAdapter&) = delete;
NetDefAdapter(const NetDefAdapter&&) = delete;
NetDefAdapter &operator=(const NetDefAdapter &) = delete;
NetDefAdapter &operator=(const NetDefAdapter &&) = delete;
private:
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const DataFormat data_format,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), data_format(data_format),
shape(shape), op_idx(op_idx) {}
MemoryType mem_type;
DataType dtype;
DataFormat data_format;
std::vector<index_t> shape; // tensor shape
int op_idx;  // operation which generates the tensor
};
typedef std::unordered_map<std::string, InternalOutputInfo> TensorInfoMap;
private:
MaceStatus AdaptDevice(OpConditionContext *context,
Device *target_device,
Device *cpu_device,
const TensorInfoMap &output_map,
const NetDef *net_def,
OperatorDef *op);
MaceStatus AdaptDataType(OpConditionContext *context,
OperatorDef *op);
MaceStatus AdaptDataFormat(
OpConditionContext *context,
OperatorDef *op,
bool is_quantized_model,
TensorInfoMap *output_map,
std::unordered_set<std::string> *transformed_set,
DataFormat *op_output_df,
NetDef *target_net_def);
MaceStatus AdaptMemoryType(
mace::OpConditionContext *context,
mace::OperatorDef *op_def,
TensorInfoMap *output_map,
std::unordered_set<std::string> *transformed_set,
MemoryType *op_output_mem_types,
mace::NetDef *target_net_def);
std::string DebugString(const NetDef *net_def);
private:
const OpRegistryBase *op_registry_;
const Workspace *ws_;
NetOptimizer net_optimizer_;
};
} // namespace mace
#endif // MACE_CORE_NET_DEF_ADAPTER_H_
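The implementation file (presumably net_def_adapter.cc, whose diff is collapsed above) is not shown here, so the loop below is only an assumed sketch of how AdaptNetDef might chain the private passes declared in this header; all local names (tensor_shape_map, output_map, transformed_set, cpu_device, and so on) are illustrative.

// Assumed per-op adaptation flow (illustrative only).
OpConditionContext context(ws_, &tensor_shape_map);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
  OperatorDef op(net_def->op(idx));
  context.set_operator_def(&op);
  MACE_RETURN_IF_ERROR(AdaptDevice(&context, target_device, cpu_device,
                                   output_map, net_def, &op));
  MACE_RETURN_IF_ERROR(AdaptDataType(&context, &op));
  MACE_RETURN_IF_ERROR(AdaptDataFormat(&context, &op, is_quantized_model,
                                       &output_map, &transformed_set,
                                       &op_output_df, target_net_def));
  MACE_RETURN_IF_ERROR(AdaptMemoryType(&context, &op, &output_map,
                                       &transformed_set, &op_output_mem_type,
                                       target_net_def));
  // The adapted op (possibly preceded by inserted transform ops) goes into
  // the target net def.
  target_net_def->add_op()->CopyFrom(op);
}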
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/net_optimizer.h"
#include <string>
namespace mace {
DeviceType NetOptimizer::SelectBestDevice(
const mace::OperatorDef *op_def,
DeviceType target_device_type,
const std::set<mace::DeviceType> &available_devices,
const std::vector<mace::DeviceType> &inputs_op_devices) {
static const std::set<std::string> kComputeIntensiveOps = {
"Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d",
"FullyConnected"
};
// CPU is the device to fall back
DeviceType best_device = DeviceType::CPU;
if (available_devices.count(target_device_type) == 1) {
best_device = target_device_type;
}
if (best_device == DeviceType::CPU) {
return best_device;
}
// Put compute-intensive ops in target device
if (kComputeIntensiveOps.count(op_def->type()) == 1) {
return best_device;
}
// Greedy strategy: Use input op's device type as current op's device
for (auto device_type : inputs_op_devices) {
if (device_type != best_device) {
best_device = device_type;
}
}
return best_device;
}
} // namespace mace
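A small, hypothetical call site for the greedy placer; assume op_def describes a non-compute-intensive, element-wise op, and that the device lists below are illustrative (in this commit the real caller is the NetDefAdapter).

// An op whose producers already run on CPU is kept on CPU, avoiding a
// device round trip, even when the target device (GPU) could run it.
NetOptimizer optimizer;
std::set<DeviceType> available = {DeviceType::CPU, DeviceType::GPU};
std::vector<DeviceType> input_op_devices = {DeviceType::CPU};
DeviceType placed = optimizer.SelectBestDevice(
    &op_def, DeviceType::GPU, available, input_op_devices);
// placed == DeviceType::CPU here; a compute-intensive op such as Conv2D
// would be placed on the GPU instead.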
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_OPTIMIZER_H_
#define MACE_CORE_NET_OPTIMIZER_H_
#include <set>
#include <vector>
#include "mace/port/port.h"
#include "mace/proto/mace.pb.h"
namespace mace {
class NetOptimizer {
public:
DeviceType SelectBestDevice(const OperatorDef *op_def,
DeviceType target_device,
const std::set<DeviceType> &available_devices,
const std::vector<DeviceType> &inputs_op_devices);
};
} // namespace mace
#endif // MACE_CORE_NET_OPTIMIZER_H_
...@@ -20,34 +20,21 @@ ...@@ -20,34 +20,21 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
namespace mace { namespace mace {
OpConditionContext::OpConditionContext(
OpConstructContext::OpConstructContext(Workspace *ws) const mace::Workspace *ws,
: operator_def_(nullptr), mace::OpConditionContext::TensorShapeMap *info)
ws_(ws),
device_(nullptr),
tensor_shape_info_(nullptr) {}
OpConstructContext::OpConstructContext(
mace::Workspace *ws,
mace::OpConstructContext::TensorShapeMap *info)
: operator_def_(nullptr), : operator_def_(nullptr),
ws_(ws), ws_(ws),
device_(nullptr), device_(nullptr),
tensor_shape_info_(info) {} tensor_shape_info_(info) {}
void OpConstructContext::set_operator_def( void OpConditionContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) { const mace::OperatorDef *operator_def) {
operator_def_ = operator_def; operator_def_ = operator_def;
input_data_types_.clear(); input_data_types_.clear();
} }
void OpConstructContext::set_output_mem_type(mace::MemoryType type) { void OpConditionContext::SetInputInfo(size_t idx,
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
void OpConstructContext::SetInputInfo(size_t idx,
mace::MemoryType mem_type, mace::MemoryType mem_type,
mace::DataType dt) { mace::DataType dt) {
if (input_mem_types_.empty()) { if (input_mem_types_.empty()) {
...@@ -66,7 +53,13 @@ void OpConstructContext::SetInputInfo(size_t idx, ...@@ -66,7 +53,13 @@ void OpConstructContext::SetInputInfo(size_t idx,
input_data_types_[idx] = dt; input_data_types_[idx] = dt;
} }
MemoryType OpConstructContext::GetInputMemType(size_t idx) const { void OpConditionContext::set_output_mem_type(mace::MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
MemoryType OpConditionContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) { if (input_mem_types_.empty()) {
return output_mem_type_; return output_mem_type_;
} }
...@@ -75,7 +68,7 @@ MemoryType OpConstructContext::GetInputMemType(size_t idx) const { ...@@ -75,7 +68,7 @@ MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
return input_mem_types_[idx]; return input_mem_types_[idx];
} }
DataType OpConstructContext::GetInputDataType(size_t idx) const { DataType OpConditionContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) { if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type. // the default inputs' data types are same as operation's data type.
return static_cast<DataType>( return static_cast<DataType>(
...@@ -87,7 +80,7 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const { ...@@ -87,7 +80,7 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const {
} }
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
void OpConstructContext::SetInputOpenCLBufferType( void OpConditionContext::SetInputOpenCLBufferType(
size_t idx, OpenCLBufferType buffer_type) { size_t idx, OpenCLBufferType buffer_type) {
if (input_opencl_buffer_types_.empty()) { if (input_opencl_buffer_types_.empty()) {
// the default inputs' memory types are same as output memory type. // the default inputs' memory types are same as output memory type.
...@@ -97,7 +90,7 @@ void OpConstructContext::SetInputOpenCLBufferType( ...@@ -97,7 +90,7 @@ void OpConstructContext::SetInputOpenCLBufferType(
MACE_CHECK(idx < input_opencl_buffer_types_.size()); MACE_CHECK(idx < input_opencl_buffer_types_.size());
input_opencl_buffer_types_[idx] = buffer_type; input_opencl_buffer_types_[idx] = buffer_type;
} }
OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType( OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType(
size_t idx) const { size_t idx) const {
if (input_opencl_buffer_types_.empty()) { if (input_opencl_buffer_types_.empty()) {
return OpenCLBufferType::IN_OUT_CHANNEL; return OpenCLBufferType::IN_OUT_CHANNEL;
...@@ -107,6 +100,16 @@ OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType( ...@@ -107,6 +100,16 @@ OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
} }
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) {
operator_def_ = operator_def;
}
OpInitContext::OpInitContext(Workspace *ws, Device *device) OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {} : ws_(ws), device_(device) {}
...@@ -202,16 +205,26 @@ const std::string OpKeyBuilder::Build() { ...@@ -202,16 +205,26 @@ const std::string OpKeyBuilder::Build() {
} // namespace } // namespace
OpRegistrationInfo::OpRegistrationInfo() { OpRegistrationInfo::OpRegistrationInfo() {
device_placer = [this](OpConstructContext *context) -> std::set<DeviceType> { // default device type placer
auto op = context->operator_def(); device_placer = [this](OpConditionContext *context) -> std::set<DeviceType> {
// The GPU ops only support 4D In/Out tensor by default MACE_UNUSED(context);
if (this->devices.count(DeviceType::CPU) == 1 &&
op->output_shape_size() == op->output_size() &&
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return this->devices; return this->devices;
}; };
// default input and output memory type setter
memory_type_setter = [](OpConditionContext *context) -> void {
if (context->device()->device_type() == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
if (context->device()->gpu_runtime()->UseImageMemory()) {
context->set_output_mem_type(MemoryType::GPU_IMAGE);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
#endif // MACE_ENABLE_OPENCL
} else {
context->set_output_mem_type(MemoryType::CPU_BUFFER);
}
};
} }
void OpRegistrationInfo::AddDevice(mace::DeviceType device) { void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
...@@ -255,13 +268,21 @@ MaceStatus OpRegistryBase::Register( ...@@ -255,13 +268,21 @@ MaceStatus OpRegistryBase::Register(
} }
const std::set<DeviceType> OpRegistryBase::AvailableDevices( const std::set<DeviceType> OpRegistryBase::AvailableDevices(
const std::string &op_type, OpConstructContext *context) const { const std::string &op_type, OpConditionContext *context) const {
MACE_CHECK(registry_.count(op_type) != 0, MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered."); op_type, " operation is not registered.");
return registry_.at(op_type)->device_placer(context); return registry_.at(op_type)->device_placer(context);
} }
void OpRegistryBase::GetInOutMemoryTypes(
const std::string &op_type,
mace::OpConditionContext *context) const {
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
return registry_.at(op_type)->memory_type_setter(context);
}
std::unique_ptr<Operation> OpRegistryBase::CreateOperation( std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type) const { DeviceType device_type) const {
...@@ -269,15 +290,6 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation( ...@@ -269,15 +290,6 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
DataType dtype = static_cast<DataType>( DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT))); *operator_def, "T", static_cast<int>(DT_FLOAT)));
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "(" VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on " << operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type; << device_type;
...@@ -308,10 +320,21 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc( ...@@ -308,10 +320,21 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
return *this; return *this;
} }
OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter(
mace::OpRegistrationInfo::MemoryTypeSetter setter) {
memory_type_setter_ = setter;
return *this;
}
void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const { void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
if (info != nullptr && placer_) { if (info != nullptr) {
if (placer_) {
info->device_placer = placer_; info->device_placer = placer_;
} }
if (memory_type_setter_) {
info->memory_type_setter = memory_type_setter_;
}
}
} }
} // namespace mace } // namespace mace
...@@ -32,22 +32,20 @@ ...@@ -32,22 +32,20 @@
namespace mace { namespace mace {
// memory_optimizer, device // OpConditionContext has all information used for choosing proper Op
class OpConstructContext { class OpConditionContext {
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
public: public:
explicit OpConstructContext(Workspace *ws); typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
OpConstructContext(Workspace *ws, TensorShapeMap *info); OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConstructContext() = default; ~OpConditionContext() = default;
void set_operator_def(std::shared_ptr<OperatorDef> operator_def); void set_operator_def(const OperatorDef* operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const { inline const OperatorDef *operator_def() const {
return operator_def_; return operator_def_;
} }
inline Workspace *workspace() const { inline const Workspace *workspace() const {
return ws_; return ws_;
} }
...@@ -81,8 +79,8 @@ class OpConstructContext { ...@@ -81,8 +79,8 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
private: private:
std::shared_ptr<OperatorDef> operator_def_; const OperatorDef *operator_def_;
Workspace *ws_; const Workspace *ws_;
Device *device_; Device *device_;
TensorShapeMap *tensor_shape_info_; TensorShapeMap *tensor_shape_info_;
// used for memory transform // used for memory transform
...@@ -94,6 +92,38 @@ class OpConstructContext { ...@@ -94,6 +92,38 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
}; };
// memory_optimizer, device
class OpConstructContext {
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
public:
explicit OpConstructContext(Workspace *ws);
~OpConstructContext() = default;
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
}
inline Workspace *workspace() const {
return ws_;
}
inline void set_device(Device* device) {
device_ = device;
}
inline Device *device() const {
return device_;
}
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
};
// memory_optimizer, device // memory_optimizer, device
class OpInitContext { class OpInitContext {
public: public:
...@@ -207,8 +237,11 @@ struct OpRegistrationInfo { ...@@ -207,8 +237,11 @@ struct OpRegistrationInfo {
public: public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)> typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
OpCreator; OpCreator;
typedef std::function<std::set<DeviceType>(OpConstructContext *)> typedef std::function<std::set<DeviceType>(OpConditionContext *)>
DevicePlacer; DevicePlacer;
typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
DataFormatSelector;
OpRegistrationInfo(); OpRegistrationInfo();
...@@ -219,6 +252,8 @@ struct OpRegistrationInfo { ...@@ -219,6 +252,8 @@ struct OpRegistrationInfo {
std::set<DeviceType> devices; std::set<DeviceType> devices;
std::unordered_map<std::string, OpCreator> creators; std::unordered_map<std::string, OpCreator> creators;
DevicePlacer device_placer; DevicePlacer device_placer;
MemoryTypeSetter memory_type_setter;
DataFormatSelector data_format_selector;
}; };
class OpConditionBuilder { class OpConditionBuilder {
...@@ -230,11 +265,18 @@ class OpConditionBuilder { ...@@ -230,11 +265,18 @@ class OpConditionBuilder {
OpConditionBuilder &SetDevicePlacerFunc( OpConditionBuilder &SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer); OpRegistrationInfo::DevicePlacer placer);
// If you set input memory type for specified Op,
// you must call OpConditionContext::set_output_mem_type
OpConditionBuilder &SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter);
void Finalize(OpRegistrationInfo *info) const; void Finalize(OpRegistrationInfo *info) const;
private: private:
std::string type_; std::string type_;
OpRegistrationInfo::DevicePlacer placer_; OpRegistrationInfo::DevicePlacer placer_;
OpRegistrationInfo::MemoryTypeSetter memory_type_setter_;
OpRegistrationInfo::DataFormatSelector data_format_selector_;
}; };
...@@ -250,7 +292,10 @@ class OpRegistryBase { ...@@ -250,7 +292,10 @@ class OpRegistryBase {
MaceStatus Register(const OpConditionBuilder &builder); MaceStatus Register(const OpConditionBuilder &builder);
const std::set<DeviceType> AvailableDevices( const std::set<DeviceType> AvailableDevices(
const std::string &op_type, OpConstructContext *context) const; const std::string &op_type, OpConditionContext *context) const;
void GetInOutMemoryTypes(
const std::string &op_type, OpConditionContext *context) const;
std::unique_ptr<Operation> CreateOperation( std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context, OpConstructContext *context,
......
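How these hooks are expected to be consulted for each op (presumably by the NetDefAdapter): a sketch with illustrative locals (ws, tensor_shape_map, op_def, target_device, op_registry), and assuming OpConditionContext exposes a set_device() setter matching the device() accessor used by the default memory_type_setter.

// Ask the registry, through an OpConditionContext, which devices an op
// supports and which input/output memory types it expects.
OpConditionContext context(ws, &tensor_shape_map);
context.set_operator_def(&op_def);
context.set_device(target_device);  // assumed setter
std::set<DeviceType> devices =
    op_registry->AvailableDevices(op_def.type(), &context);
op_registry->GetInOutMemoryTypes(op_def.type(), &context);
// Afterwards, GetInputMemType(i)/GetInputDataType(i) describe what input i
// must be transformed to before the op can run.
MemoryType wanted_mem = context.GetInputMemType(0);
DataType wanted_dt = context.GetInputDataType(0);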
...@@ -147,38 +147,38 @@ void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ ...@@ -147,38 +147,38 @@ void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
} }
} }
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef( void OpenCLUtil::BuildTransformOpDef(
const std::string &input_name, const std::string &input_name,
const std::vector<mace::index_t> &input_shape, const std::vector<mace::index_t> &input_shape,
const std::string &output_name, const std::string &output_name,
const mace::DataType dt, const mace::DataType dt,
const OpenCLBufferType buffer_type, const OpenCLBufferType buffer_type,
const mace::MemoryType mem_type, const mace::MemoryType mem_type,
bool has_data_format) { DataFormat data_format,
std::unique_ptr<OperatorDef> op(new OperatorDef); OperatorDef *op_def) {
std::string op_name = "mace_node_" + output_name; std::string op_name = "mace_node_" + output_name;
op->set_name(op_name); op_def->set_name(op_name);
op->set_type("BufferTransform"); op_def->set_type("BufferTransform");
op->add_input(input_name); op_def->add_input(input_name);
op->add_output(output_name); op_def->add_output(output_name);
Argument *arg = op->add_arg(); op_def->set_device_type(DeviceType::GPU);
Argument *arg = op_def->add_arg();
arg->set_name("buffer_type"); arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(buffer_type)); arg->set_i(static_cast<int32_t>(buffer_type));
arg = op->add_arg(); arg = op_def->add_arg();
arg->set_name("mem_type"); arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type)); arg->set_i(static_cast<int32_t>(mem_type));
arg = op->add_arg(); arg = op_def->add_arg();
arg->set_name("T"); arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt)); arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg(); arg = op_def->add_arg();
arg->set_name("has_data_format"); arg->set_name("data_format");
arg->set_i(has_data_format); arg->set_i(data_format);
if (!input_shape.empty()) { if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape(); OutputShape *shape = op_def->add_output_shape();
for (auto value : input_shape) { for (auto value : input_shape) {
shape->add_dims(value); shape->add_dims(value);
} }
} }
return std::move(op);
} }
} // namespace mace } // namespace mace
...@@ -43,14 +43,15 @@ class OpenCLUtil { ...@@ -43,14 +43,15 @@ class OpenCLUtil {
std::vector<size_t> *image_shape, std::vector<size_t> *image_shape,
const int wino_blk_size = 2); const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef( static void BuildTransformOpDef(
const std::string &input_name, const std::string &input_name,
const std::vector<mace::index_t> &input_shape, const std::vector<mace::index_t> &input_shape,
const std::string &output_name, const std::string &output_name,
const mace::DataType dt, const mace::DataType dt,
const OpenCLBufferType buffer_type, const OpenCLBufferType buffer_type,
const MemoryType mem_type, const MemoryType mem_type,
bool has_data_format); DataFormat data_format,
OperatorDef *op_def);
}; };
} // namespace mace } // namespace mace
......
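For reference, a sketch of the new in-place builder; the tensor names, shape and data type are made up.

// Build a BufferTransform op that moves a tensor into a GPU image,
// carrying an explicit data_format instead of the old has_data_format flag.
OperatorDef transform_op_def;
OpenCLUtil::BuildTransformOpDef(
    "input_tensor",                        // hypothetical source tensor
    std::vector<index_t>{1, 224, 224, 3},
    "input_tensor_gpu_image",              // hypothetical transformed name
    DataType::DT_HALF,
    OpenCLBufferType::IN_OUT_CHANNEL,
    MemoryType::GPU_IMAGE,
    DataFormat::NHWC,
    &transform_op_def);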
...@@ -263,13 +263,13 @@ MaceStatus Workspace::PreallocateOutputTensor( ...@@ -263,13 +263,13 @@ MaceStatus Workspace::PreallocateOutputTensor(
} }
} }
VLOG(1) << "Preallocate buffer to tensors"; VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id), (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id),
tensor_mem.second.data_type, tensor_mem.second.data_type,
false, tensor_mem.first)); false, tensor_mem.first));
if (tensor_mem.second.has_data_format) { tensor->set_data_format(tensor_mem.second.data_format);
if (tensor_mem.second.data_format != DataFormat::DF_NONE) {
if (mem_blocks[tensor_mem.second.mem_id].mem_type() if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_IMAGE) { == MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first VLOG(1) << "Tensor: " << tensor_mem.first
...@@ -279,23 +279,13 @@ MaceStatus Workspace::PreallocateOutputTensor( ...@@ -279,23 +279,13 @@ MaceStatus Workspace::PreallocateOutputTensor(
<< tensor->UnderlyingBuffer()->shape()[0] << tensor->UnderlyingBuffer()->shape()[0]
<< ", " << ", "
<< tensor->UnderlyingBuffer()->shape()[1]; << tensor->UnderlyingBuffer()->shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else { } else {
VLOG(1) << "Tensor: " << tensor_mem.first VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id << " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype() << " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size(); << ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
} }
} }
} else {
tensor->set_data_format(DataFormat::DF_NONE);
}
tensor_map_[tensor_mem.first] = std::move(tensor); tensor_map_[tensor_mem.first] = std::move(tensor);
} }
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/port/env.h" #include "mace/port/env.h"
#include "mace/port/file_system.h" #include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/gpu_device.h"
...@@ -516,20 +517,26 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -516,20 +517,26 @@ MaceStatus MaceEngine::Impl::Init(
device_.get(), device_.get(),
model_data)); model_data));
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), ws_.get());
net_def_adapter.AdaptNetDef(net_def, device_.get(), &adapted_net_def);
MemoryOptimizer mem_optimizer; MemoryOptimizer mem_optimizer;
// Init model // Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(), net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def, &adapted_net_def,
ws_.get(), ws_.get(),
device_.get(), device_.get(),
&mem_optimizer)); &mem_optimizer));
// Preallocate all output tensors of ops // Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(adapted_net_def,
&mem_optimizer, &mem_optimizer,
device_.get())); device_.get()));
if (device_type_ == DeviceType::GPU) { if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator()); ws_->RemoveAndReloadBuffer(adapted_net_def,
model_data,
device_->allocator());
} }
MACE_RETURN_IF_ERROR(net_->Init()); MACE_RETURN_IF_ERROR(net_->Init());
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#include <memory> #include <memory>
#include <set>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#if defined(MACE_ENABLE_NEON) #if defined(MACE_ENABLE_NEON)
...@@ -132,6 +134,22 @@ void RegisterActivation(OpRegistryBase *op_registry) { ...@@ -132,6 +134,22 @@ void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half); DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
} }
} // namespace ops } // namespace ops
......
...@@ -103,6 +103,22 @@ void RegisterAddN(OpRegistryBase *op_registry) { ...@@ -103,6 +103,22 @@ void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half); MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
} }
} // namespace ops } // namespace ops
......
...@@ -145,6 +145,22 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) { ...@@ -145,6 +145,22 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half); DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
} }
} // namespace ops } // namespace ops
......
...@@ -39,14 +39,14 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -39,14 +39,14 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
auto type = auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>( static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER))); "buffer_type", static_cast<int>(CONV2D_FILTER)));
bool has_data_format = Operation::GetOptionalArg<int>("has_data_format", 0) DataFormat data_format = static_cast<DataFormat>(
!= 0; Operation::GetOptionalArg<int>("data_format", DataFormat::DF_NONE));
MemoryType in_mem_type = context->workspace()->GetTensor( MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type(); operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform( return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, context, input, type, out_mem_type_, wino_blk_size_,
has_data_format, output); data_format, output);
} }
private: private:
......
...@@ -116,10 +116,10 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) { ...@@ -116,10 +116,10 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
op_registry, op_registry,
OpConditionBuilder("ChannelShuffle") OpConditionBuilder("ChannelShuffle")
.SetDevicePlacerFunc( .SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return { DeviceType::CPU };
} }
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1); *op, "group", 1);
......
...@@ -241,13 +241,11 @@ void RegisterConcat(OpRegistryBase *op_registry) { ...@@ -241,13 +241,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
op_registry, op_registry,
OpConditionBuilder("Concat") OpConditionBuilder("Concat")
.SetDevicePlacerFunc( .SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
auto tensor_shape_info = context->tensor_shape_info(); auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size() ||
return { DeviceType::CPU, DeviceType::GPU }; op->output_shape(0).dims_size() != 4) {
}
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU }; return { DeviceType::CPU };
} else { } else {
int has_data_format = int has_data_format =
......
...@@ -466,7 +466,6 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -466,7 +466,6 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
} }
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format // Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) && if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd( (kernel_->CheckUseWinograd(
......
...@@ -145,6 +145,22 @@ void RegisterCrop(OpRegistryBase *op_registry) { ...@@ -145,6 +145,22 @@ void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp, MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half); DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
} }
} // namespace ops } // namespace ops
......
...@@ -197,7 +197,6 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { ...@@ -197,7 +197,6 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
OpenCLBufferType::ARGUMENT, OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} }
context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
} }
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
...@@ -264,6 +263,30 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) { ...@@ -264,6 +263,30 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half); DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
MACE_NOT_IMPLEMENTED;
}
FrameworkType framework_type =
static_cast<ops::FrameworkType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*(context->operator_def()), "framework_type",
FrameworkType::TENSORFLOW));
if (framework_type == FrameworkType::TENSORFLOW) {
context->SetInputInfo(2, MemoryType::CPU_BUFFER,
DataType::DT_INT32);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -382,7 +382,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { ...@@ -382,7 +382,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type = MemoryType::GPU_BUFFER; mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>(); kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
} }
context->set_output_mem_type(mem_type);
Tensor *filter_tensor = context->workspace()->GetTensor( Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1)); operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) { if (filter_tensor != nullptr && filter_tensor->is_weight()) {
...@@ -393,8 +392,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { ...@@ -393,8 +392,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
1, 1,
OpenCLBufferType::DW_CONV2D_FILTER, OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS); mem_type) == MaceStatus::MACE_SUCCESS);
} else {
context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER);
} }
if (operator_def_->input_size() > 2) { if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>( MACE_CHECK(TransformFilter<T>(
...@@ -440,6 +437,27 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { ...@@ -440,6 +437,27 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half); DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
mem_type = MemoryType::GPU_BUFFER;
}
auto filter_tensor = context->workspace()->GetTensor(
context->operator_def()->input(1));
if (filter_tensor == nullptr || !filter_tensor->is_weight()) {
context->SetInputOpenCLBufferType(
1, OpenCLBufferType::DW_CONV2D_FILTER);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
} }
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
namespace mace { namespace mace {
...@@ -44,27 +43,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation { ...@@ -44,27 +43,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
std::vector<index_t> output_shape(input_shape); std::vector<index_t> output_shape(input_shape);
output_shape.insert(output_shape.begin() + axis_, 1); output_shape.insert(output_shape.begin() + axis_, 1);
bool has_data_format = Operation::GetOptionalArg<int>( output->ReuseTensorBuffer(*input);
"has_data_format", 0) == 1; output->Reshape(output_shape);
if (has_data_format && output_shape.size() == 4) {
// only tensorflow support expand dim, so the default format is NHWC
// transform NHWC to NCHW
auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
{0, 3, 1, 2});
output->Resize(t_output_shape);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
Transpose(&context->device()->cpu_runtime()->thread_pool(),
input_data, output_shape, {0, 3, 1, 2}, output_data);
} else {
output->Resize(output_shape);
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<T>();
output->Copy<T>(input_data, input->size());
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
......
...@@ -518,14 +518,6 @@ void RegisterMatMul(OpRegistryBase *op_registry) { ...@@ -518,14 +518,6 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, uint8_t); DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
} }
} // namespace ops } // namespace ops
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "mace/ops/opencl/image/buffer_to_image.h" #include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/image/image_to_buffer.h"
#include "mace/ops/opencl/buffer/buffer_transform.h" #include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/memory.h" #include "mace/utils/memory.h"
namespace mace { namespace mace {
...@@ -48,7 +47,7 @@ class OpenCLBufferTransformer { ...@@ -48,7 +47,7 @@ class OpenCLBufferTransformer {
const OpenCLBufferType type, const OpenCLBufferType type,
const MemoryType out_mem_type, const MemoryType out_mem_type,
const int wino_blk_size, const int wino_blk_size,
bool has_data_format, DataFormat data_format,
Tensor *output) { Tensor *output) {
Workspace *ws = context->workspace(); Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value; DataType dt = DataTypeToEnum<T>::value;
...@@ -67,31 +66,12 @@ class OpenCLBufferTransformer { ...@@ -67,31 +66,12 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform CPU Buffer " << input->name() VLOG(2) << "Transform CPU Buffer " << input->name()
<< " to GPU Buffer " << internal_tensor->name() << " to GPU Buffer " << internal_tensor->name()
<< " with data type " << dt; << " with data type " << dt;
if (has_data_format && input->shape().size() == 4) { MACE_CHECK(data_format == DataFormat::NHWC);
// 1. (NCHW -> NHWC)
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(input->shape(),
dst_dims);
internal_tensor->Resize(output_shape);
internal_tensor->set_data_format(DataFormat::NHWC);
// TODO(liuqi): Only support float now
const float *input_ptr = input->data<float>();
Tensor::MappingGuard guard(internal_tensor);
float *internal_ptr = internal_tensor->mutable_data<float>();
MACE_RETURN_IF_ERROR(ops::Transpose(
&context->device()->cpu_runtime()->thread_pool(),
input_ptr,
input->shape(),
dst_dims,
internal_ptr));
} else {
internal_tensor->Resize(input->shape()); internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>(); const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor); Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>(); uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size()); memcpy(internal_ptr, input_ptr, input->raw_size());
}
// 2. convert the internal GPU Buffer to output. // 2. convert the internal GPU Buffer to output.
return kernel_->Compute( return kernel_->Compute(
context, internal_tensor, type, wino_blk_size, output); context, internal_tensor, type, wino_blk_size, output);
...@@ -108,30 +88,13 @@ class OpenCLBufferTransformer { ...@@ -108,30 +88,13 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() VLOG(2) << "Transform GPU Buffer " << internal_tensor.name()
<< " to CPU Buffer " << output->name() << " to CPU Buffer " << output->name()
<< " with data type " << dt; << " with data type " << dt;
if (has_data_format && internal_tensor.shape().size() == 4) { MACE_CHECK(data_format == DataFormat::NHWC);
// NHWC -> NCHW
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(internal_tensor.shape(),
dst_dims);
output->set_data_format(DataFormat::NCHW);
Tensor::MappingGuard guard(&internal_tensor);
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(output_shape);
float *output_ptr = output->mutable_data<float>();
return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(),
internal_ptr,
internal_tensor.shape(),
dst_dims,
output_ptr);
} else {
Tensor::MappingGuard guard(&internal_tensor); Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>(); const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape()); output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>(); T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
}
} else { } else {
LOG(FATAL) << "Unexpected error: " << out_mem_type; LOG(FATAL) << "Unexpected error: " << out_mem_type;
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
......
...@@ -71,14 +71,17 @@ MaceStatus EltwiseKernel<T>::Compute( ...@@ -71,14 +71,17 @@ MaceStatus EltwiseKernel<T>::Compute(
if (input1 == nullptr) { if (input1 == nullptr) {
input1_type = "INPUT_SCALAR"; input1_type = "INPUT_SCALAR";
} else { } else {
MACE_CHECK(input0->dim_size() == input1->dim_size() || MACE_CHECK((input0->dim_size() == input1->dim_size()
&& input0->dim_size() == 4) ||
input0->dim_size() == 1 || input1->dim_size() == 1) input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape"; << "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
MACE_CHECK(type_ != EltwiseType::EQUAL) MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL"; << "Eltwise op on GPU does not support EQUAL";
// broadcast // broadcast
if (input0->size() != input1->size()) { if (input0->size() != input1->size() ||
if (input0->size() < input1->size()) { input0->dim_size() != input1->dim_size()) {
if (input0->size() < input1->size()
|| input0->dim_size() < input1->dim_size()) {
std::swap(input0, input1); std::swap(input0, input1);
swapped = true; swapped = true;
} }
......
...@@ -59,11 +59,6 @@ MaceStatus ReduceKernel<T>::Compute( ...@@ -59,11 +59,6 @@ MaceStatus ReduceKernel<T>::Compute(
const Tensor *input, const Tensor *input,
Tensor *output) { Tensor *output) {
MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(input);
MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
MACE_CHECK(input->dim_size() == 4,
"reduce gpu only support 4-dim input");
MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
"reduce gpu only support 1,2-axis reduce");
index_t batch = input->dim(0); index_t batch = input->dim(0);
const index_t in_height = input->dim(1); const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2); const index_t in_width = input->dim(2);
......
...@@ -480,7 +480,6 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase { ...@@ -480,7 +480,6 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
if (context->device()->gpu_runtime()->UseImageMemory()) { if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>(); kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
} else { } else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>(); kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
} }
} }
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <set>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -907,6 +908,31 @@ void RegisterReduce(OpRegistryBase *op_registry) { ...@@ -907,6 +908,31 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp, MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half); DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Reduce")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
bool keep_dims =
ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
*op, "keepdims", false);
if (!keep_dims) {
return { DeviceType::CPU };
}
auto axis =
ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
*op, "axis");
if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
return { DeviceType::CPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (tensor_shape_info->count(op->input(0)) == 0
|| tensor_shape_info->at(op->input(0)).size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
} }
} // namespace ops } // namespace ops
......
...@@ -100,11 +100,7 @@ class ScalarMathOp : public Operation { ...@@ -100,11 +100,7 @@ class ScalarMathOp : public Operation {
coeff_(Operation::GetRepeatedArgs<float>("coeff")), coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)), scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>( scalar_input_index_(Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1)) { "scalar_input_index", 1)) {}
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
......
...@@ -417,7 +417,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation { ...@@ -417,7 +417,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
if (context->device()->gpu_runtime()->UseImageMemory()) { if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log); kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
} else { } else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log); kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);
} }
} }
...@@ -456,7 +455,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) { ...@@ -456,7 +455,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
op_registry, op_registry,
OpConditionBuilder("Softmax") OpConditionBuilder("Softmax")
.SetDevicePlacerFunc( .SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return { DeviceType::CPU, DeviceType::GPU };
......
...@@ -144,10 +144,10 @@ void RegisterSplit(OpRegistryBase *op_registry) { ...@@ -144,10 +144,10 @@ void RegisterSplit(OpRegistryBase *op_registry) {
op_registry, op_registry,
OpConditionBuilder("Split") OpConditionBuilder("Split")
.SetDevicePlacerFunc( .SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU}; return { DeviceType::CPU };
} }
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3); *op, "axis", 3);
......
...@@ -77,7 +77,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) { ...@@ -77,7 +77,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
op_registry, op_registry,
OpConditionBuilder("Squeeze") OpConditionBuilder("Squeeze")
.SetDevicePlacerFunc( .SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> { [](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def(); auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) { if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU }; return { DeviceType::CPU, DeviceType::GPU };
......
...@@ -36,7 +36,8 @@ enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 }; ...@@ -36,7 +36,8 @@ enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 };
enum DataFormat { enum DataFormat {
DF_NONE = 0, NHWC = 1, NCHW = 2, DF_NONE = 0, NHWC = 1, NCHW = 2,
HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103,
DF_AUTO = 1000,
}; };
enum GPUPerfHint { enum GPUPerfHint {
......
...@@ -33,6 +33,7 @@ class DataFormat(Enum): ...@@ -33,6 +33,7 @@ class DataFormat(Enum):
OIHW = 101 OIHW = 101
HWOI = 102 HWOI = 102
OHWI = 103 OHWI = 103
DF_AUTO = 1000
# SAME_LOWER: if the amount of paddings to be added is odd, # SAME_LOWER: if the amount of paddings to be added is odd,
...@@ -161,13 +162,39 @@ MaceSupportedOps = [ ...@@ -161,13 +162,39 @@ MaceSupportedOps = [
'SumGroup', 'SumGroup',
'TargetRMSNorm', 'TargetRMSNorm',
'Transpose', 'Transpose',
'WinogradInverseTransform',
'WinogradTransform',
'Cumsum', 'Cumsum',
] ]
MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str) MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str)
MaceHasDataFormatOps = [MaceOp.BatchNorm,
MaceOp.BatchToSpaceND,
MaceOp.Conv2D,
MaceOp.Deconv2D,
MaceOp.DepthToSpace,
MaceOp.DepthwiseConv2d,
MaceOp.DepthwiseDeconv2d,
MaceOp.FullyConnected,
MaceOp.Pooling,
MaceOp.ResizeBicubic,
MaceOp.ResizeBilinear,
MaceOp.ResizeNearestNeighbor,
MaceOp.SpaceToBatchND,
MaceOp.SpaceToDepth]
MaceMayHasDataFormatOps = [MaceOp.Activation,
MaceOp.AddN,
MaceOp.BiasAdd,
MaceOp.ChannelShuffle,
MaceOp.Concat,
MaceOp.Crop,
MaceOp.Eltwise,
MaceOp.Pad,
MaceOp.Reduce,
MaceOp.Softmax,
MaceOp.Split,
MaceOp.SqrDiffMean]
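
A note on the two lists just above: `MaceHasDataFormatOps` holds ops whose semantics always depend on a spatial layout, while `MaceMayHasDataFormatOps` holds ops that only inherit a layout from their inputs; `update_data_format` further down keys off membership in them. Because `MaceOp` is a str-valued Enum, the plain protobuf string in `op.type` compares equal to the enum members, which is what makes `op.type in MaceHasDataFormatOps` work. A minimal, self-contained illustration with a reduced op set (names for demonstration only):

```python
from enum import Enum

# Reduced stand-in for the MaceOp str-Enum defined above; because the members
# mix in str, a plain string such as a protobuf op.type compares equal to them.
MaceOp = Enum('MaceOp', [(op, op) for op in ['Conv2D', 'Eltwise']], type=str)

MaceHasDataFormatOps = [MaceOp.Conv2D]       # always layout-sensitive
MaceMayHasDataFormatOps = [MaceOp.Eltwise]   # layout follows the inputs

assert 'Conv2D' in MaceHasDataFormatOps      # op.type is a plain str in the proto
assert 'Eltwise' not in MaceHasDataFormatOps
assert 'Eltwise' in MaceMayHasDataFormatOps
```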
class MaceKeyword(object): class MaceKeyword(object):
# node related str # node related str
...@@ -505,12 +532,11 @@ class ConverterOption(object): ...@@ -505,12 +532,11 @@ class ConverterOption(object):
TransformerRule.TRANSFORM_CHANNEL_SHUFFLE, TransformerRule.TRANSFORM_CHANNEL_SHUFFLE,
# Model data format related transformation # Model data format related transformation
TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_FILTERS,
TransformerRule.TRANSPOSE_DATA_FORMAT, # Mace model structure related transformation
TransformerRule.ADD_IN_OUT_TENSOR_INFO,
TransformerRule.TRANSPOSE_MATMUL_WEIGHT, TransformerRule.TRANSPOSE_MATMUL_WEIGHT,
# Add winograd argument # Add winograd argument
TransformerRule.ADD_WINOGRAD_ARG, TransformerRule.ADD_WINOGRAD_ARG,
# Mace model structure related transformation
TransformerRule.ADD_IN_OUT_TENSOR_INFO,
# Data type related transformation # Data type related transformation
TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE,
# Transform finalization # Transform finalization
...@@ -519,6 +545,7 @@ class ConverterOption(object): ...@@ -519,6 +545,7 @@ class ConverterOption(object):
TransformerRule.SORT_BY_EXECUTION, TransformerRule.SORT_BY_EXECUTION,
# update the data format of ops # update the data format of ops
TransformerRule.UPDATE_DATA_FORMAT, TransformerRule.UPDATE_DATA_FORMAT,
TransformerRule.TRANSPOSE_DATA_FORMAT,
# Need to be put after SORT_BY_EXECUTION # Need to be put after SORT_BY_EXECUTION
TransformerRule.ADD_QUANTIZE_TENSOR_RANGE, TransformerRule.ADD_QUANTIZE_TENSOR_RANGE,
] ]
...@@ -571,6 +598,8 @@ class ConverterUtil(object): ...@@ -571,6 +598,8 @@ class ConverterUtil(object):
return DataFormat.NHWC return DataFormat.NHWC
elif arg.i == DataFormat.NCHW.value: elif arg.i == DataFormat.NCHW.value:
return DataFormat.NCHW return DataFormat.NCHW
elif arg.i == DataFormat.DF_AUTO.value:
return DataFormat.DF_AUTO
else: else:
return None return None
......
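
The converter constructors below stamp a model-level data format with `ConverterUtil.add_data_format_arg(...)` (NCHW for Caffe and ONNX, NHWC for TensorFlow), and the branch above reads it back, now including `DF_AUTO`. A rough sketch of that round trip, using a plain dict in place of the protobuf argument list; the real helpers operate on `mace_pb2` messages, so everything here is illustrative rather than the actual implementation:

```python
from enum import Enum

class DataFormat(Enum):
    DF_NONE = 0
    NHWC = 1
    NCHW = 2
    DF_AUTO = 1000

def add_data_format_arg(args, data_format):
    # Store the enum value under the "data_format" key, the way the real
    # helper adds an integer argument to an OperatorDef/NetDef.
    args["data_format"] = data_format.value

def data_format(args):
    # Map the stored integer back to the enum; unknown values map to None,
    # matching the else branch above.
    value = args.get("data_format")
    for df in (DataFormat.NHWC, DataFormat.NCHW, DataFormat.DF_AUTO):
        if value == df.value:
            return df
    return None

net_args = {}
add_data_format_arg(net_args, DataFormat.NHWC)   # what the TF converter does
assert data_format(net_args) == DataFormat.NHWC
```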
...@@ -195,6 +195,7 @@ class CaffeConverter(base_converter.ConverterInterface): ...@@ -195,6 +195,7 @@ class CaffeConverter(base_converter.ConverterInterface):
self._option = option self._option = option
self._mace_net_def = mace_pb2.NetDef() self._mace_net_def = mace_pb2.NetDef()
ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
ConverterUtil.add_data_format_arg(self._mace_net_def, DataFormat.NCHW)
self._caffe_net = CaffeNet() self._caffe_net = CaffeNet()
self._caffe_layers = caffe_pb2.NetParameter() self._caffe_layers = caffe_pb2.NetParameter()
caffe_weights = caffe_pb2.NetParameter() caffe_weights = caffe_pb2.NetParameter()
......
...@@ -387,6 +387,7 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -387,6 +387,7 @@ class OnnxConverter(base_converter.ConverterInterface):
self._mace_net_def = mace_pb2.NetDef() self._mace_net_def = mace_pb2.NetDef()
self._data_format = DataFormat.NCHW self._data_format = DataFormat.NCHW
ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW) ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
ConverterUtil.add_data_format_arg(self._mace_net_def, self._data_format)
onnx_model = onnx.load(src_model_file) onnx_model = onnx.load(src_model_file)
ir_version = onnx_model.ir_version ir_version = onnx_model.ir_version
......
...@@ -270,6 +270,7 @@ class TensorflowConverter(base_converter.ConverterInterface): ...@@ -270,6 +270,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
self._option = option self._option = option
self._mace_net_def = mace_pb2.NetDef() self._mace_net_def = mace_pb2.NetDef()
ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.HWIO) ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.HWIO)
ConverterUtil.add_data_format_arg(self._mace_net_def, DataFormat.NHWC)
# import tensorflow graph # import tensorflow graph
tf_graph_def = tf.GraphDef() tf_graph_def = tf.GraphDef()
......
...@@ -27,6 +27,8 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType ...@@ -27,6 +27,8 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType
from mace.python.tools.converter_tool.base_converter import FrameworkType from mace.python.tools.converter_tool.base_converter import FrameworkType
from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps
from mace.python.tools.converter_tool.base_converter import MaceMayHasDataFormatOps # noqa
from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import ReduceType from mace.python.tools.converter_tool.base_converter import ReduceType
from mace.python.tools.converter_tool.base_converter import TransformerRule from mace.python.tools.converter_tool.base_converter import TransformerRule
...@@ -77,10 +79,9 @@ class Transformer(base_converter.ConverterInterface): ...@@ -77,10 +79,9 @@ class Transformer(base_converter.ConverterInterface):
self.transpose_matmul_weight, self.transpose_matmul_weight,
TransformerRule.FOLD_FC_RESHAPE: TransformerRule.FOLD_FC_RESHAPE:
self.fold_fc_reshape, self.fold_fc_reshape,
TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg,
TransformerRule.ADD_IN_OUT_TENSOR_INFO: TransformerRule.ADD_IN_OUT_TENSOR_INFO:
self.add_in_out_tensor_info, self.add_in_out_tensor_info,
TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg,
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC:
self.transform_global_conv_to_fc, self.transform_global_conv_to_fc,
TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight,
...@@ -96,6 +97,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -96,6 +97,7 @@ class Transformer(base_converter.ConverterInterface):
self.add_opencl_informations, self.add_opencl_informations,
TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution,
TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format, TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format,
TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
TransformerRule.CHECK_QUANTIZE_INFO: TransformerRule.CHECK_QUANTIZE_INFO:
self.check_quantize_info, self.check_quantize_info,
TransformerRule.TRANSPOSE_CAFFE_RESHAPE_AND_FLATTEN: TransformerRule.TRANSPOSE_CAFFE_RESHAPE_AND_FLATTEN:
...@@ -194,21 +196,19 @@ class Transformer(base_converter.ConverterInterface): ...@@ -194,21 +196,19 @@ class Transformer(base_converter.ConverterInterface):
op.type = "Input" op.type = "Input"
data_type_arg = op.arg.add() data_type_arg = op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_FLOAT data_type_arg.i = input_node.data_type
op.output.extend([input_node.name]) op.output.extend([input_node.name])
output_shape = op.output_shape.add() output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape) output_shape.dims.extend(input_node.shape)
if input_node.name in self._consumers: if input_node.data_format != DataFormat.DF_NONE:
if ConverterUtil.data_format( if input_node.data_format == DataFormat.NCHW:
self._consumers[input_node.name][0]) \
== DataFormat.NCHW:
self.transpose_shape(output_shape.dims, self.transpose_shape(output_shape.dims,
[0, 3, 1, 2]) [0, 3, 1, 2])
ConverterUtil.add_data_format_arg(op, ConverterUtil.add_data_format_arg(op,
DataFormat.NCHW) DataFormat.DF_AUTO)
else: else:
ConverterUtil.add_data_format_arg(op, ConverterUtil.add_data_format_arg(op,
DataFormat.NHWC) DataFormat.DF_NONE)
self._producer[op.output[0]] = op self._producer[op.output[0]] = op
@staticmethod @staticmethod
...@@ -256,6 +256,13 @@ class Transformer(base_converter.ConverterInterface): ...@@ -256,6 +256,13 @@ class Transformer(base_converter.ConverterInterface):
else: else:
return None return None
def get_tensor_data_format(self, tensor):
if tensor in self._producer:
producer = self._producer[tensor]
return ConverterUtil.data_format(producer)
else:
return DataFormat.DF_NONE
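
`get_tensor_data_format` is used later when Quantize/Dequantize ops are synthesized, so the inserted op can inherit the layout of the tensor it wraps. A dict-based approximation of the lookup (names illustrative; the real method consults `self._producer` built from the `mace_pb2` graph):

```python
DF_NONE, DF_AUTO = 0, 1000

def get_tensor_data_format(tensor_name, producer):
    # Tensors with a producing op take that op's declared data format;
    # weights and other tensors without a producer fall back to DF_NONE.
    op = producer.get(tensor_name)
    return op["data_format"] if op is not None else DF_NONE

producer = {"conv_out": {"data_format": DF_AUTO}}
assert get_tensor_data_format("conv_out", producer) == DF_AUTO
assert get_tensor_data_format("weights", producer) == DF_NONE
```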
def consumer_count(self, tensor_name): def consumer_count(self, tensor_name):
return len(self._consumers.get(tensor_name, [])) return len(self._consumers.get(tensor_name, []))
...@@ -838,8 +845,6 @@ class Transformer(base_converter.ConverterInterface): ...@@ -838,8 +845,6 @@ class Transformer(base_converter.ConverterInterface):
or op.type == MaceOp.DepthwiseConv2d.name or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name) or op.type == MaceOp.FullyConnected.name)
and len(op.input) == 2) and len(op.input) == 2)
or (op.type == MaceOp.WinogradInverseTransform.name
and len(op.input) == 1)
or (op.type == MaceOp.Deconv2D.name or (op.type == MaceOp.Deconv2D.name
and ((ConverterUtil.get_arg( and ((ConverterUtil.get_arg(
op, op,
...@@ -930,8 +935,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -930,8 +935,7 @@ class Transformer(base_converter.ConverterInterface):
or op.type == MaceOp.Deconv2D.name or op.type == MaceOp.Deconv2D.name
or op.type == MaceOp.DepthwiseConv2d.name or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name or op.type == MaceOp.FullyConnected.name
or op.type == MaceOp.BatchNorm.name or op.type == MaceOp.BatchNorm.name) \
or op.type == MaceOp.WinogradInverseTransform.name) \
and len(self._consumers.get(op.output[0], [])) == 1: and len(self._consumers.get(op.output[0], [])) == 1:
consumer_op = self._consumers[op.output[0]][0] consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.Activation.name \ if consumer_op.type == MaceOp.Activation.name \
...@@ -1017,96 +1021,6 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1017,96 +1021,6 @@ class Transformer(base_converter.ConverterInterface):
filter_format.name) filter_format.name)
return False return False
def transpose_data_format(self):
net = self._model
for op in net.op:
# transpose args
if op.type == MaceOp.Pad.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_paddings_str:
mace_check(len(arg.ints) == 8,
"pad dim rank should be 8.")
if ConverterUtil.data_format(op) == DataFormat.NCHW:
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints,
[0, 1, 4, 5, 6, 7, 2, 3])
elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if (ConverterUtil.data_format(op) == DataFormat.NCHW
and len(op.output_shape[0].dims) == 4):
print("Transpose concat/split args: %s(%s)"
% (op.name, op.type))
if arg.i == 1:
arg.i = 3
elif arg.i == 2:
arg.i = 1
elif arg.i == 3:
arg.i = 2
producer = self._producer[op.input[0]]
input_shape = producer.output_shape[0].dims
if producer.type == MaceOp.FullyConnected.name and \
len(input_shape) == 2:
axis_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_axis_str)
if axis_arg.i == 1:
axis_arg.i = 3
elif op.type == MaceOp.Squeeze.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if ConverterUtil.data_format(op) == DataFormat.NCHW:
print("Transpose squeeze args: %s(%s)"
% (op.name, op.type))
mace_check(list(arg.ints) == [2, 3],
'only support squeeze at at [2, 3]')
arg.ints[:] = [1, 2]
elif op.type == MaceOp.Reduce.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if ConverterUtil.data_format(
op) == DataFormat.NCHW:
print("Transpose reduce args: %s(%s)"
% (op.name, op.type))
reduce_axises = list(arg.ints)
new_axises = []
for i in range(len(reduce_axises)):
idx = reduce_axises[i]
if idx == 2 or idx == 3:
new_axises.append(idx - 1)
elif idx == 1:
new_axises.append(3)
else:
new_axises.append(idx)
new_axises.sort()
arg.ints[:] = []
arg.ints.extend(new_axises)
elif op.type == MaceOp.Crop.name:
offset_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_offset_str)
mace_check(offset_arg and
ConverterUtil.data_format(op) == DataFormat.NCHW and
len(op.output_shape[0].dims) == 4,
"MACE only support crop with NCHW format")
print("Transpose crop args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
# transpose op output shape
data_format = ConverterUtil.data_format(op)
if data_format is not None \
and data_format != DataFormat.NHWC:
print("Transpose output shapes: %s(%s)" % (op.name, op.type))
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 2, 3, 1])
return False
def add_winograd_arg(self): def add_winograd_arg(self):
if self._wino_arg == 0: if self._wino_arg == 0:
...@@ -1428,17 +1342,121 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1428,17 +1342,121 @@ class Transformer(base_converter.ConverterInterface):
def update_data_format(self): def update_data_format(self):
print("update data format") print("update data format")
data_format_flag = 1
for input_node in self._option.input_nodes.values():
if input_node.data_format.value == DataFormat.DF_NONE.value:
data_format_flag = 0
net = self._model net = self._model
for op in net.op: for op in net.op:
ConverterUtil.del_arg( df_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_data_format_str) op, MaceKeyword.mace_data_format_str)
if not df_arg:
df_arg = op.arg.add()
df_arg.name = MaceKeyword.mace_data_format_str
if op.type in MaceHasDataFormatOps:
df_arg.i = DataFormat.DF_AUTO.value
elif op.type in MaceMayHasDataFormatOps:
input_df = DataFormat.DF_AUTO.value
for input_tensor in op.input:
if input_tensor in self._consts:
continue
mace_check(input_tensor in self._producer,
"Input tensor %s not in producer" % input_tensor)
father_op = self._producer[input_tensor]
temp_input_df = ConverterUtil.get_arg(
father_op, MaceKeyword.mace_data_format_str)
if temp_input_df.i != DataFormat.DF_AUTO.value:
input_df = temp_input_df.i
if input_df == DataFormat.DF_AUTO.value:
df_arg.i = input_df
# add flag to mark the ops may has data format
has_data_format_arg = op.arg.add() has_data_format_arg = op.arg.add()
has_data_format_arg.name = MaceKeyword.mace_has_data_format_str has_data_format_arg.name = \
has_data_format_arg.i = data_format_flag MaceKeyword.mace_has_data_format_str
has_data_format_arg.i = 1
return False
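
A standalone sketch of the propagation rule implemented above: ops listed in `MaceHasDataFormatOps` are always tagged `DF_AUTO`, while ops in `MaceMayHasDataFormatOps` keep `DF_AUTO` only if every non-constant input's producer already carries it. Plain dicts stand in for `mace_pb2` ops, so the structure is simplified and the names are illustrative:

```python
DF_NONE, DF_AUTO = 0, 1000

HAS_DATA_FORMAT_OPS = {"Conv2D", "Pooling"}        # subset, for illustration
MAY_HAVE_DATA_FORMAT_OPS = {"Eltwise", "Softmax"}  # subset, for illustration

def update_data_format(ops, producer, consts):
    """ops: topologically sorted list of dicts {type, input, data_format}."""
    for op in ops:
        if op["type"] in HAS_DATA_FORMAT_OPS:
            op["data_format"] = DF_AUTO
        elif op["type"] in MAY_HAVE_DATA_FORMAT_OPS:
            input_df = DF_AUTO
            for tensor in op["input"]:
                if tensor in consts:          # constants carry no layout
                    continue
                if producer[tensor]["data_format"] != DF_AUTO:
                    input_df = producer[tensor]["data_format"]
            if input_df == DF_AUTO:           # all real inputs agreed
                op["data_format"] = DF_AUTO

conv = {"type": "Conv2D", "input": ["in", "w"], "data_format": DF_NONE}
add = {"type": "Eltwise", "input": ["conv_out", "bias"], "data_format": DF_NONE}
update_data_format([conv, add], {"conv_out": conv}, {"w", "bias"})
assert conv["data_format"] == DF_AUTO and add["data_format"] == DF_AUTO
```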
def transpose_data_format(self):
print("Transpose arguments based on data format")
net = self._model
src_data_format = ConverterUtil.data_format(net)
for op in net.op:
has_data_format = ConverterUtil.data_format(op) == \
DataFormat.DF_AUTO
# transpose args
if op.type == MaceOp.Pad.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_paddings_str:
mace_check(len(arg.ints) == 8,
"pad dim rank should be 8.")
if src_data_format == DataFormat.NCHW and \
has_data_format:
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints,
[0, 1, 4, 5, 6, 7, 2, 3])
elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if (src_data_format == DataFormat.NCHW
and has_data_format
and len(op.output_shape[0].dims) == 4):
print("Transpose concat/split args: %s(%s)"
% (op.name, op.type))
if arg.i == 1:
arg.i = 3
elif arg.i == 2:
arg.i = 1
elif arg.i == 3:
arg.i = 2
producer = self._producer[op.input[0]]
input_shape = producer.output_shape[0].dims
if producer.type == MaceOp.FullyConnected.name and \
len(input_shape) == 2:
axis_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_axis_str)
if axis_arg.i == 1:
axis_arg.i = 3
elif op.type == MaceOp.Reduce.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if src_data_format == DataFormat.NCHW and \
has_data_format:
print("Transpose reduce args: %s(%s)"
% (op.name, op.type))
reduce_axises = list(arg.ints)
new_axises = []
for i in range(len(reduce_axises)):
idx = reduce_axises[i]
if idx == 2 or idx == 3:
new_axises.append(idx - 1)
elif idx == 1:
new_axises.append(3)
else:
new_axises.append(idx)
new_axises.sort()
arg.ints[:] = []
arg.ints.extend(new_axises)
elif op.type == MaceOp.Crop.name:
offset_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_offset_str)
mace_check(offset_arg and
src_data_format == DataFormat.NCHW
and has_data_format
and len(op.output_shape[0].dims) == 4,
"MACE only support crop with NCHW format")
print("Transpose crop args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
# transpose op output shape
if src_data_format == DataFormat.NCHW and \
has_data_format:
print("Transpose output shapes: %s(%s)" % (op.name, op.type))
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 2, 3, 1])
return False return False
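
The rewrites above only fire when the model-level source format is NCHW and the op was tagged `DF_AUTO`; they rely on `Transformer.transpose_shape` (called but not shown here) to reorder dims in place. A minimal sketch of that behavior plus the 4-D Concat/Split axis remap, assuming the in-place semantics implied by the call sites:

```python
def transpose_shape(shape, order):
    # In-place reorder, mirroring how transpose_shape(dims, order) is called above.
    shape[:] = [shape[i] for i in order]

# An NCHW output shape is rewritten to NHWC for ops tagged DF_AUTO.
dims = [1, 3, 224, 224]
transpose_shape(dims, [0, 2, 3, 1])
assert dims == [1, 224, 224, 3]

def remap_concat_axis(axis):
    # NCHW -> NHWC axis move used for 4-D Concat/Split above:
    # channel axis 1 becomes 3, spatial axes 2 and 3 shift down by one.
    return {0: 0, 1: 3, 2: 1, 3: 2}[axis]

assert remap_concat_axis(1) == 3   # a channel concat stays a channel concat
```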
def quantize_nodes(self): def quantize_nodes(self):
...@@ -1493,7 +1511,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1493,7 +1511,7 @@ class Transformer(base_converter.ConverterInterface):
self._model.input_info[i].zero_point = quantize_info.zero_point self._model.input_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) ConverterUtil.add_data_format_arg(op_def, input_node.data_format)
# use actual ranges for model input quantize # use actual ranges for model input quantize
find_range_every_time_arg = op_def.arg.add() find_range_every_time_arg = op_def.arg.add()
find_range_every_time_arg.name = \ find_range_every_time_arg.name = \
...@@ -1516,6 +1534,7 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1516,6 +1534,7 @@ class Transformer(base_converter.ConverterInterface):
self._model.output_info[i].zero_point = quantize_info.zero_point self._model.output_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, output_node.data_format)
quantize_flag_arg = self._model.arg.add() quantize_flag_arg = self._model.arg.add()
quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str
...@@ -1886,9 +1905,6 @@ class Transformer(base_converter.ConverterInterface): ...@@ -1886,9 +1905,6 @@ class Transformer(base_converter.ConverterInterface):
shape_tensor.data_type = mace_pb2.DT_INT32 shape_tensor.data_type = mace_pb2.DT_INT32
else: else:
mace_check(False, "Only support reshape and flatten") mace_check(False, "Only support reshape and flatten")
# NCHW -> NHWC
if len(dims) == 4:
self.transpose_shape(dims, [0, 2, 3, 1])
shape_tensor.int32_data.extend(dims) shape_tensor.int32_data.extend(dims)
op.input.append(shape_tensor.name) op.input.append(shape_tensor.name)
...@@ -2030,6 +2046,9 @@ class Transformer(base_converter.ConverterInterface): ...@@ -2030,6 +2046,9 @@ class Transformer(base_converter.ConverterInterface):
data_type_arg = quantize_op.arg.add() data_type_arg = quantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_UINT8 data_type_arg.i = mace_pb2.DT_UINT8
ConverterUtil.add_data_format_arg(
quantize_op,
self.get_tensor_data_format(input_tensor))
data_type_arg = quantize_op.arg.add() data_type_arg = quantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_non_zero data_type_arg.name = MaceKeyword.mace_non_zero
...@@ -2050,8 +2069,8 @@ class Transformer(base_converter.ConverterInterface): ...@@ -2050,8 +2069,8 @@ class Transformer(base_converter.ConverterInterface):
del op.input[:] del op.input[:]
op.input.extend(quantized_inputs_names) op.input.extend(quantized_inputs_names)
orginal_output_name = op.output[0] original_output_name = op.output[0]
op.output[0] = orginal_output_name + "_quant" op.output[0] = original_output_name + "_quant"
op.output_type.extend([to_quantize_ops_output_type[op.type]]) op.output_type.extend([to_quantize_ops_output_type[op.type]])
data_type_arg = ConverterUtil.get_arg(op, data_type_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_op_data_type_str) # noqa MaceKeyword.mace_op_data_type_str) # noqa
...@@ -2064,13 +2083,15 @@ class Transformer(base_converter.ConverterInterface): ...@@ -2064,13 +2083,15 @@ class Transformer(base_converter.ConverterInterface):
dequantize_op.name = op.name + "_dequant" dequantize_op.name = op.name + "_dequant"
dequantize_op.type = MaceOp.Dequantize.name dequantize_op.type = MaceOp.Dequantize.name
dequantize_op.input.extend([op.output[0]]) dequantize_op.input.extend([op.output[0]])
dequantize_op.output.extend([orginal_output_name]) dequantize_op.output.extend([original_output_name])
dequantize_op.output_shape.extend(op.output_shape) dequantize_op.output_shape.extend(op.output_shape)
dequantize_op.output_type.extend([mace_pb2.DT_FLOAT]) dequantize_op.output_type.extend([mace_pb2.DT_FLOAT])
data_type_arg = dequantize_op.arg.add() data_type_arg = dequantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = to_quantize_ops_output_type[op.type] data_type_arg.i = to_quantize_ops_output_type[op.type]
ConverterUtil.add_data_format_arg(
dequantize_op,
self.get_tensor_data_format(original_output_name))
quantize_flag_arg = ConverterUtil.get_arg(self._model, quantize_flag_arg = ConverterUtil.get_arg(self._model,
MaceKeyword.mace_quantize_flag_arg_str) # noqa MaceKeyword.mace_quantize_flag_arg_str) # noqa
if quantize_flag_arg is None: if quantize_flag_arg is None:
......