提交 74dcd617 编写于 作者: L liuqi

Refactor: Support auto transformation and net optimization.

1. Auto transformation between data format, data type and memory type.
2. Add device placement optimization strategy.
上级 bde945cd
......@@ -96,6 +96,44 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
// Defines SetProtoArg<T>(def, arg_name, value): if an argument named
// `arg_name` already exists on `def`, its `fieldname` field is overwritten
// in place; otherwise a new argument is appended.
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname)               \
  template<>                                                             \
  void SetProtoArg<T>(Def *def,                                          \
                      const std::string &arg_name,                       \
                      const T &value) {                                  \
    int size = def->arg_size();                                          \
    for (int i = 0; i < size; ++i) {                                     \
      auto arg = def->mutable_arg(i);                                    \
      if (arg->name() == arg_name) {                                     \
        VLOG(3) << "Update old argument value from "                     \
                << arg->fieldname() << " to "                            \
                << value << " for " << arg_name;                         \
        arg->set_##fieldname(value);                                     \
        return;                                                          \
      }                                                                  \
    }                                                                    \
    VLOG(3) << "Add new argument " << arg_name << "(name: "              \
            << arg_name << ", value: " << value << ")";                  \
    auto arg = def->add_arg();                                           \
    arg->set_name(arg_name);                                             \
    arg->set_##fieldname(value);                                         \
  }

// Instantiates SetProtoArg for the supported scalar types.
// bool/int/int64_t all map onto the proto's integer field `i`;
// std::string maps onto `s`, float onto `f`.
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(Def)          \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f)            \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i)             \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i)              \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i)          \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, std::string, s)

MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef)
MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef)

// Macro hygiene: undef both helper macros (the original leaked
// MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO past its last use).
#undef MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO
#undef MACE_SET_OPTIONAL_ARGUMENT_FUNC
// Returns the argument tag used to record an op's output memory type
// in the proto (see SerialNet, which reads "output_mem_type" back).
std::string OutputMemoryTypeTagName() {
  static const std::string kTag("output_mem_type");
  return kTag;
}
bool IsQuantizedModel(const NetDef &net_def) {
return
......
......@@ -55,6 +55,18 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_;
};
template <typename T>
void SetProtoArg(OperatorDef *op_def,
const std::string &arg_name,
const T&value);
template <typename T>
void SetProtoArg(NetDef *op_def,
const std::string &arg_name,
const T&value);
std::string OutputMemoryTypeTagName();
bool IsQuantizedModel(const NetDef &def);
} // namespace mace
......
......@@ -33,7 +33,7 @@ namespace mace {
// Returns true for ops whose output may alias (reuse) its input buffer,
// so the optimizer does not allocate a fresh block for the output.
// NOTE: the diff artifact left both the pre- and post-change initializer
// lines in place; adjacent string literals would have concatenated
// ("Squeeze" "Reshape" -> "SqueezeReshape") and silently broken lookups.
// Only the updated list (with "ExpandDims") is kept.
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOp = {
      "Reshape", "Identity", "Squeeze", "ExpandDims"
  };
  return kReuseOp.count(op_type) == 1;
}
......@@ -124,8 +124,9 @@ void MemoryOptimizer::Optimize(
op_def->output_type_size());
DataType dt;
bool has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "has_data_format", 0) != 0;
DataFormat data_format = static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "data_format", DataFormat::DF_NONE));
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) {
......@@ -209,7 +210,7 @@ void MemoryOptimizer::Optimize(
mem_ref_count_[best_mem_id] = 1;
}
tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id,
dt, has_data_format));
dt, data_format));
}
}
......
......@@ -22,6 +22,7 @@
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/types.h"
namespace mace {
......@@ -81,10 +82,10 @@ class MemoryOptimizer {
struct TensorMemInfo {
int mem_id;
DataType data_type;
bool has_data_format;
DataFormat data_format;
TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) :
mem_id(mem_id), data_type(data_type), has_data_format(has_data_format)
TensorMemInfo(int mem_id, DataType data_type, DataFormat data_format) :
mem_id(mem_id), data_type(data_type), data_format(data_format)
{}
};
......
......@@ -31,99 +31,8 @@
#include "mace/utils/memory.h"
#include "mace/utils/timer.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace {
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const DataFormat data_format,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), data_format(data_format),
shape(shape), op_idx(op_idx) {}
MemoryType mem_type; // transformed memory type
DataType dtype;
DataFormat data_format;
std::vector<index_t> shape; // tensor shape
int op_idx; // operation which generate the tensor
};
#ifdef MACE_ENABLE_OPENCL
std::string TransformedName(const std::string &input_name,
const mace::MemoryType mem_type) {
std::stringstream ss;
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
bool TransformRequiredOp(const std::string &op_type) {
static const std::unordered_set<std::string> kNoTransformOp = {
"Shape", "InferConv2dShape"
};
return kNoTransformOp.count(op_type) == 0;
}
#endif // MACE_ENABLE_OPENCL
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
bool has_data_format,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_.get());
construct_context->set_operator_def(op_def);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
// Get available devices
auto available_devices =
op_registry->AvailableDevices(op_def->type(), construct_context);
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to CPU device.
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context->set_device(target_device_);
if (target_device_->device_type() == DeviceType::GPU) {
construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
}
break;
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
{0, 3, 1, 2});
for (int i = 0; i < 4; ++i) {
op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
}
}
}
}
return op_registry->CreateOperation(construct_context, device_type);
}
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
......@@ -138,237 +47,47 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(),
&target_device->cpu_runtime()->thread_pool())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
// Tensor Shape map
std::unordered_map<std::string, std::vector<index_t>> tensor_shape_map;
for (auto &op : net_def->op()) {
if (op.output_size() != op.output_shape_size()) {
continue;
}
for (int i = 0; i < op.output_size(); ++i) {
tensor_shape_map[op.output(i)] = std::vector<index_t>(
op.output_shape(i).dims().begin(),
op.output_shape(i).dims().end());
}
}
for (auto &tensor : net_def->tensors()) {
tensor_shape_map[tensor.name()] =
std::vector<index_t>(tensor.dims().begin(), tensor.dims().end());
}
bool has_data_format = false;
if (target_device_->device_type() == DeviceType::CPU) {
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
// Only could be NONE or NHWC
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = has_data_format ||
(input_data_format != DataFormat::DF_NONE);
if (!is_quantize_model && input_data_format == DataFormat::NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
}
}
}
#ifdef MACE_ENABLE_OPENCL
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_set<std::string> transformed_set;
// add input information
MemoryType target_mem_type;
// default data format of output tensor
DataFormat default_output_df = DataFormat::DF_NONE;
if (target_device_->device_type() == DeviceType::GPU) {
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = input_data_format != DataFormat::DF_NONE;
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_data_format,
input_shape, -1));
}
default_output_df =
has_data_format ? DataFormat::NHWC : DataFormat::DF_NONE;
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_, &tensor_shape_map);
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
has_data_format,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
// the outputs' memory type of the operation
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
// if op is memory-unused op, no transformation
if (TransformRequiredOp(op_def->type())) {
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether to do transform
MemoryType wanted_in_mem_type =
construct_context.GetInputMemType(i);
DataType wanted_in_dt = construct_context.GetInputDataType(i);
if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
|| output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
auto t_input_name = TransformedName(op_def->input(i),
wanted_in_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation " << op_def->name()
<< " to transform tensor "
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< wanted_in_dt << ". with data format "
<< output_info.data_format;
std::string input_name = op_def->input(i);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
output_info.data_format == DataFormat::NCHW &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name, wanted_in_dt,
construct_context.GetInputOpenCLBufferType(i),
wanted_in_mem_type, has_data_format);
OpConstructContext t_construct_context(ws_);
auto transform_op = CreateOperation(
op_registry,
&t_construct_context,
transform_op_def,
has_data_format);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, t_input_name);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
}
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
DataType dt;
if (op_def->output_type_size() == op_def->output_size()) {
dt = op_def->output_type(out_idx);
} else {
dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
}
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
default_output_df,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
auto op_device_type = static_cast<DeviceType>(op_def->device_type());
if (op_device_type == target_device_->device_type()) {
construct_context.set_device(target_device_);
} else if (op_device_type == DeviceType::CPU) {
construct_context.set_device(cpu_device_.get());
} else {
LOG(FATAL) << "Encounter unexpected error: "
<< op_device_type << " vs " << target_device_->device_type();
}
#endif // MACE_ENABLE_OPENCL
construct_context.set_operator_def(op_def);
auto op = op_registry->CreateOperation(&construct_context,
op_device_type);
operators_.emplace_back(std::move(op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != output_info.data_type()) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << output_info.data_type();
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
bool output_has_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
output_info.data_type(),
OpenCLBufferType::IN_OUT_CHANNEL,
target_mem_type,
output_has_data_format);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_has_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
if (target_device_->device_type() == DeviceType::GPU) {
// update the map : output_tensor -> Operation
MemoryType out_mem_type =
static_cast<MemoryType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
net_def->op(idx), OutputMemoryTypeTagName(),
static_cast<int>(MemoryType::CPU_BUFFER)));
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
}
}
}
#endif // MACE_ENABLE_OPENCL
}
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
......
......@@ -54,14 +54,6 @@ class SerialNet : public NetBase {
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
bool has_data_format,
bool is_quantize_model = false);
protected:
Workspace *ws_;
Device *target_device_;
......
此差异已折叠。
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_DEF_ADAPTER_H_
#define MACE_CORE_NET_DEF_ADAPTER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/operator.h"
#include "mace/core/net_optimizer.h"
namespace mace {
class OpRegistryBase;
class Workspace;
class Device;
/**
 * Conventions:
 * 1. DataFormat::DT_AUTO stands for a formatted tensor (NHWC or NCHW).
 * 2. If an Op has DataFormat::DT_AUTO, the arguments of this op
 *    are formatted to NHWC.
 */
// Rewrites a NetDef for a target device before execution: per-op device
// placement, data-type/data-format/memory-type adaptation (see the
// Adapt* methods below).
class NetDefAdapter {
 public:
  NetDefAdapter(const OpRegistryBase *op_registry,
                const Workspace *ws);

  // Produces target_net_def from net_def, adapted to target_device.
  MaceStatus AdaptNetDef(
      const NetDef *net_def,
      Device *target_device,
      NetDef *target_net_def);

 public:
  // Non-copyable and non-movable.
  NetDefAdapter(const NetDefAdapter&) = delete;
  NetDefAdapter(const NetDefAdapter&&) = delete;
  NetDefAdapter &operator=(const NetDefAdapter &) = delete;
  NetDefAdapter &operator=(const NetDefAdapter &&) = delete;

 private:
  // Bookkeeping for one tensor produced while adapting the graph.
  struct InternalOutputInfo {
    InternalOutputInfo(const MemoryType mem_type,
                       const DataType dtype,
                       const DataFormat data_format,
                       const std::vector<index_t> &shape,
                       int op_idx)
        : mem_type(mem_type), dtype(dtype), data_format(data_format),
          shape(shape), op_idx(op_idx) {}

    MemoryType mem_type;
    DataType dtype;
    DataFormat data_format;
    std::vector<index_t> shape;  // tensor shape
    int op_idx;  // operation which generate the tensor
  };

  typedef std::unordered_map<std::string, InternalOutputInfo> TensorInfoMap;

 private:
  // One Adapt* step per concern; each may rewrite `op` and/or append
  // transform ops to target_net_def.
  MaceStatus AdaptDevice(OpConditionContext *context,
                         Device *target_device,
                         Device *cpu_device,
                         const TensorInfoMap &output_map,
                         const NetDef *net_def,
                         OperatorDef *op);
  MaceStatus AdaptDataType(OpConditionContext *context,
                           OperatorDef *op);
  MaceStatus AdaptDataFormat(
      OpConditionContext *context,
      OperatorDef *op,
      bool is_quantized_model,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      DataFormat *op_output_df,
      NetDef *target_net_def);
  MaceStatus AdaptMemoryType(
      mace::OpConditionContext *context,
      mace::OperatorDef *op_def,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      MemoryType *op_output_mem_types,
      mace::NetDef *target_net_def);

  // Human-readable dump of net_def, for logging/debugging.
  std::string DebugString(const NetDef *net_def);

 private:
  const OpRegistryBase *op_registry_;
  const Workspace *ws_;
  NetOptimizer net_optimizer_;
};
} // namespace mace
#endif // MACE_CORE_NET_DEF_ADAPTER_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/net_optimizer.h"
#include <string>
namespace mace {
// Picks the device an op should run on.
//  * CPU if the target device cannot run this op (or is itself CPU);
//  * the target device for compute-intensive ops;
//  * otherwise, greedily follow the devices of the input-producing ops.
// NOTE(review): the greedy loop keeps the last input device that differs
// from the running choice, so the result is order-dependent — confirm
// this matches the intended placement strategy.
DeviceType NetOptimizer::SelectBestDevice(
    const mace::OperatorDef *op_def,
    DeviceType target_device_type,
    const std::set<mace::DeviceType> &available_devices,
    const std::vector<mace::DeviceType> &inputs_op_devices) {
  static const std::set<std::string> kComputeIntensiveOps = {
      "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d",
      "FullyConnected"
  };
  // Fall back to CPU when the target device is not available for this op.
  if (available_devices.count(target_device_type) != 1) {
    return DeviceType::CPU;
  }
  DeviceType chosen = target_device_type;
  if (chosen == DeviceType::CPU) {
    return chosen;
  }
  // Keep compute-intensive ops on the (non-CPU) target device.
  if (kComputeIntensiveOps.count(op_def->type()) == 1) {
    return chosen;
  }
  // Greedy: inherit the device of the ops producing our inputs.
  for (const auto input_device : inputs_op_devices) {
    if (input_device != chosen) {
      chosen = input_device;
    }
  }
  return chosen;
}
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_OPTIMIZER_H_
#define MACE_CORE_NET_OPTIMIZER_H_
#include <set>
#include <vector>
#include "mace/port/port.h"
#include "mace/proto/mace.pb.h"
namespace mace {
// Device-placement strategy: decides which device each op runs on.
class NetOptimizer {
 public:
  // Returns the best device for op_def. Falls back to CPU when
  // target_device is not among available_devices; compute-intensive ops
  // stay on the target device; other ops follow their inputs' devices
  // (see net_optimizer.cc for details).
  DeviceType SelectBestDevice(const OperatorDef *op_def,
                              DeviceType target_device,
                              const std::set<DeviceType> &available_devices,
                              const std::vector<DeviceType> &inputs_op_devices);
};
} // namespace mace
#endif // MACE_CORE_NET_OPTIMIZER_H_
......@@ -20,34 +20,21 @@
#include "mace/core/operator.h"
namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(nullptr) {}
OpConstructContext::OpConstructContext(
mace::Workspace *ws,
mace::OpConstructContext::TensorShapeMap *info)
OpConditionContext::OpConditionContext(
const mace::Workspace *ws,
mace::OpConditionContext::TensorShapeMap *info)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(info) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) {
void OpConditionContext::set_operator_def(
const mace::OperatorDef *operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConstructContext::set_output_mem_type(mace::MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
void OpConstructContext::SetInputInfo(size_t idx,
void OpConditionContext::SetInputInfo(size_t idx,
mace::MemoryType mem_type,
mace::DataType dt) {
if (input_mem_types_.empty()) {
......@@ -66,7 +53,13 @@ void OpConstructContext::SetInputInfo(size_t idx,
input_data_types_[idx] = dt;
}
MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
// Sets the expected memory type of this op's outputs. Also clears any
// per-input memory-type overrides, so SetInputInfo must be called again
// after this if input types differ from the output type.
void OpConditionContext::set_output_mem_type(mace::MemoryType type) {
  MACE_CHECK(operator_def_ != nullptr);
  output_mem_type_ = type;
  input_mem_types_.clear();
}
MemoryType OpConditionContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
......@@ -75,7 +68,7 @@ MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
return input_mem_types_[idx];
}
DataType OpConstructContext::GetInputDataType(size_t idx) const {
DataType OpConditionContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
return static_cast<DataType>(
......@@ -87,17 +80,17 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const {
}
#ifdef MACE_ENABLE_OPENCL
void OpConstructContext::SetInputOpenCLBufferType(
void OpConditionContext::SetInputOpenCLBufferType(
size_t idx, OpenCLBufferType buffer_type) {
if (input_opencl_buffer_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_opencl_buffer_types_.resize(operator_def_->input_size(),
OpenCLBufferType::IN_OUT_CHANNEL);
OpenCLBufferType::IN_OUT_CHANNEL);
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
input_opencl_buffer_types_[idx] = buffer_type;
}
OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType(
size_t idx) const {
if (input_opencl_buffer_types_.empty()) {
return OpenCLBufferType::IN_OUT_CHANNEL;
......@@ -107,6 +100,16 @@ OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
}
#endif // MACE_ENABLE_OPENCL
// Context handed to operation constructors; operator_def and device are
// bound by the caller before the op is created.
OpConstructContext::OpConstructContext(Workspace *ws)
    : operator_def_(nullptr),
      ws_(ws),
      device_(nullptr) {}

// Binds the operator definition this context describes.
void OpConstructContext::set_operator_def(
    std::shared_ptr<mace::OperatorDef> operator_def) {
  operator_def_ = operator_def;
}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
......@@ -202,16 +205,26 @@ const std::string OpKeyBuilder::Build() {
} // namespace
OpRegistrationInfo::OpRegistrationInfo() {
device_placer = [this](OpConstructContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
// The GPU ops only support 4D In/Out tensor by default
if (this->devices.count(DeviceType::CPU) == 1 &&
op->output_shape_size() == op->output_size() &&
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
// default device type placer
device_placer = [this](OpConditionContext *context) -> std::set<DeviceType> {
MACE_UNUSED(context);
return this->devices;
};
// default input and output memory type setter
memory_type_setter = [](OpConditionContext *context) -> void {
if (context->device()->device_type() == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
if (context->device()->gpu_runtime()->UseImageMemory()) {
context->set_output_mem_type(MemoryType::GPU_IMAGE);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
#endif // MACE_ENABLE_OPENCL
} else {
context->set_output_mem_type(MemoryType::CPU_BUFFER);
}
};
}
void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
......@@ -255,13 +268,21 @@ MaceStatus OpRegistryBase::Register(
}
const std::set<DeviceType> OpRegistryBase::AvailableDevices(
const std::string &op_type, OpConstructContext *context) const {
const std::string &op_type, OpConditionContext *context) const {
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
return registry_.at(op_type)->device_placer(context);
}
// Invokes the registered memory_type_setter for op_type, filling in the
// input/output memory types on `context`. Fails hard on unknown op types.
void OpRegistryBase::GetInOutMemoryTypes(
    const std::string &op_type,
    mace::OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
  // (Dropped the redundant `return` of a void expression.)
  registry_.at(op_type)->memory_type_setter(context);
}
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type) const {
......@@ -269,15 +290,6 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
......@@ -308,9 +320,20 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
return *this;
}
// Registers the callback that decides this op's input/output memory types.
// Per the header comment: a setter that assigns input memory types must
// also call OpConditionContext::set_output_mem_type.
OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter(
    mace::OpRegistrationInfo::MemoryTypeSetter mem_type_setter) {
  memory_type_setter_ = mem_type_setter;
  return *this;
}
// Copies the configured condition callbacks into the registration info.
// Only callbacks that were actually set are transferred, so defaults
// installed by OpRegistrationInfo's constructor are preserved.
// NOTE: the diff artifact interleaved the old two-line body with the new
// guarded body, leaving unbalanced braces and a duplicate assignment;
// only the new guarded version is kept.
void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
  if (info != nullptr) {
    if (placer_) {
      info->device_placer = placer_;
    }
    if (memory_type_setter_) {
      info->memory_type_setter = memory_type_setter_;
    }
  }
}
......
......@@ -32,22 +32,20 @@
namespace mace {
// memory_optimizer, device
class OpConstructContext {
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
// OpConditionContext has all information used for choosing proper Op
class OpConditionContext {
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(Workspace *ws, TensorShapeMap *info);
~OpConstructContext() = default;
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
void set_operator_def(const OperatorDef* operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
inline const OperatorDef *operator_def() const {
return operator_def_;
}
inline Workspace *workspace() const {
inline const Workspace *workspace() const {
return ws_;
}
......@@ -81,8 +79,8 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
const OperatorDef *operator_def_;
const Workspace *ws_;
Device *device_;
TensorShapeMap *tensor_shape_info_;
// used for memory transform
......@@ -94,6 +92,38 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL
};
// Context passed to operation constructors: carries the operator def,
// the workspace, and the device the op will be built for.
class OpConstructContext {
  // NOTE(review): this private typedef appears unused within the class
  // as shown — possibly a leftover from the OpConditionContext split.
  typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;

 public:
  explicit OpConstructContext(Workspace *ws);
  ~OpConstructContext() = default;

  // Binds the operator definition to construct from.
  void set_operator_def(std::shared_ptr<OperatorDef> operator_def);

  inline std::shared_ptr<OperatorDef> operator_def() const {
    return operator_def_;
  }

  inline Workspace *workspace() const {
    return ws_;
  }

  inline void set_device(Device* device) {
    device_ = device;
  }

  inline Device *device() const {
    return device_;
  }

 private:
  std::shared_ptr<OperatorDef> operator_def_;
  Workspace *ws_;
  Device *device_;
};
// memory_optimizer, device
class OpInitContext {
public:
......@@ -207,8 +237,11 @@ struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
OpCreator;
typedef std::function<std::set<DeviceType>(OpConstructContext *)>
typedef std::function<std::set<DeviceType>(OpConditionContext *)>
DevicePlacer;
typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
DataFormatSelector;
OpRegistrationInfo();
......@@ -219,6 +252,8 @@ struct OpRegistrationInfo {
std::set<DeviceType> devices;
std::unordered_map<std::string, OpCreator> creators;
DevicePlacer device_placer;
MemoryTypeSetter memory_type_setter;
DataFormatSelector data_format_selector;
};
class OpConditionBuilder {
......@@ -230,11 +265,18 @@ class OpConditionBuilder {
OpConditionBuilder &SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer);
// If you set input memory type for specified Op,
// you must call OpConditionContext::set_output_mem_type
OpConditionBuilder &SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter);
void Finalize(OpRegistrationInfo *info) const;
private:
std::string type_;
OpRegistrationInfo::DevicePlacer placer_;
OpRegistrationInfo::MemoryTypeSetter memory_type_setter_;
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
......@@ -250,7 +292,10 @@ class OpRegistryBase {
MaceStatus Register(const OpConditionBuilder &builder);
const std::set<DeviceType> AvailableDevices(
const std::string &op_type, OpConstructContext *context) const;
const std::string &op_type, OpConditionContext *context) const;
void GetInOutMemoryTypes(
const std::string &op_type, OpConditionContext *context) const;
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
......
......@@ -147,38 +147,38 @@ void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
}
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
void OpenCLUtil::BuildTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const OpenCLBufferType buffer_type,
const mace::MemoryType mem_type,
bool has_data_format) {
std::unique_ptr<OperatorDef> op(new OperatorDef);
DataFormat data_format,
OperatorDef *op_def) {
std::string op_name = "mace_node_" + output_name;
op->set_name(op_name);
op->set_type("BufferTransform");
op->add_input(input_name);
op->add_output(output_name);
Argument *arg = op->add_arg();
op_def->set_name(op_name);
op_def->set_type("BufferTransform");
op_def->add_input(input_name);
op_def->add_output(output_name);
op_def->set_device_type(DeviceType::GPU);
Argument *arg = op_def->add_arg();
arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(buffer_type));
arg = op->add_arg();
arg = op_def->add_arg();
arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type));
arg = op->add_arg();
arg = op_def->add_arg();
arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg();
arg->set_name("has_data_format");
arg->set_i(has_data_format);
arg = op_def->add_arg();
arg->set_name("data_format");
arg->set_i(data_format);
if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape();
OutputShape *shape = op_def->add_output_shape();
for (auto value : input_shape) {
shape->add_dims(value);
}
}
return std::move(op);
}
} // namespace mace
......@@ -43,14 +43,15 @@ class OpenCLUtil {
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef(
static void BuildTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
bool has_data_format);
DataFormat data_format,
OperatorDef *op_def);
};
} // namespace mace
......
......@@ -263,13 +263,13 @@ MaceStatus Workspace::PreallocateOutputTensor(
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id),
tensor_mem.second.data_type,
false, tensor_mem.first));
if (tensor_mem.second.has_data_format) {
tensor->set_data_format(tensor_mem.second.data_format);
if (tensor_mem.second.data_format != DataFormat::DF_NONE) {
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
......@@ -279,22 +279,12 @@ MaceStatus Workspace::PreallocateOutputTensor(
<< tensor->UnderlyingBuffer()->shape()[0]
<< ", "
<< tensor->UnderlyingBuffer()->shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
} else {
tensor->set_data_format(DataFormat::DF_NONE);
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
......
......@@ -27,6 +27,7 @@
#include "mace/public/mace.h"
#include "mace/port/env.h"
#include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
......@@ -512,26 +513,32 @@ MaceStatus MaceEngine::Impl::Init(
}
} else {
#endif
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
device_.get(),
model_data));
MemoryOptimizer mem_optimizer;
// Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
device_.get(),
model_data));
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), ws_.get());
net_def_adapter.AdaptNetDef(net_def, device_.get(), &adapted_net_def);
MemoryOptimizer mem_optimizer;
// Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
&adapted_net_def,
ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(adapted_net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(adapted_net_def,
model_data,
device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
}
#endif
......
......@@ -15,6 +15,8 @@
#include "mace/ops/activation.h"
#include <memory>
#include <set>
#include "mace/core/operator.h"
#if defined(MACE_ENABLE_NEON)
......@@ -132,6 +134,22 @@ void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -103,6 +103,22 @@ void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -145,6 +145,22 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -39,14 +39,14 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
bool has_data_format = Operation::GetOptionalArg<int>("has_data_format", 0)
!= 0;
DataFormat data_format = static_cast<DataFormat>(
Operation::GetOptionalArg<int>("data_format", DataFormat::DF_NONE));
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_,
has_data_format, output);
data_format, output);
}
private:
......
......@@ -116,10 +116,10 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("ChannelShuffle")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
return { DeviceType::CPU };
}
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "group", 1);
......
......@@ -241,13 +241,11 @@ void RegisterConcat(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Concat")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
if (op->output_shape(0).dims_size() != 4) {
if (op->output_shape_size() != op->output_size() ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
int has_data_format =
......
......@@ -466,7 +466,6 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
......
......@@ -145,6 +145,22 @@ void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
(op->output_shape_size() != op->output_size()) ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -197,7 +197,6 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
}
}
MaceStatus Run(OpContext *context) override {
......@@ -264,6 +263,30 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
MACE_NOT_IMPLEMENTED;
}
FrameworkType framework_type =
static_cast<ops::FrameworkType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*(context->operator_def()), "framework_type",
FrameworkType::TENSORFLOW));
if (framework_type == FrameworkType::TENSORFLOW) {
context->SetInputInfo(2, MemoryType::CPU_BUFFER,
DataType::DT_INT32);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -382,7 +382,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
}
context->set_output_mem_type(mem_type);
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
......@@ -393,8 +392,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
} else {
context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
......@@ -440,6 +437,27 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
mem_type = MemoryType::GPU_BUFFER;
}
auto filter_tensor = context->workspace()->GetTensor(
context->operator_def()->input(1));
if (filter_tensor == nullptr || !filter_tensor->is_weight()) {
context->SetInputOpenCLBufferType(
1, OpenCLBufferType::DW_CONV2D_FILTER);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -14,7 +14,6 @@
#include "mace/core/operator.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/math.h"
namespace mace {
......@@ -44,27 +43,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
std::vector<index_t> output_shape(input_shape);
output_shape.insert(output_shape.begin() + axis_, 1);
bool has_data_format = Operation::GetOptionalArg<int>(
"has_data_format", 0) == 1;
if (has_data_format && output_shape.size() == 4) {
// only tensorflow support expand dim, so the default format is NHWC
// transform NHWC to NCHW
auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
{0, 3, 1, 2});
output->Resize(t_output_shape);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
Transpose(&context->device()->cpu_runtime()->thread_pool(),
input_data, output_shape, {0, 3, 1, 2}, output_data);
} else {
output->Resize(output_shape);
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<T>();
output->Copy<T>(input_data, input->size());
}
output->ReuseTensorBuffer(*input);
output->Reshape(output_shape);
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -518,14 +518,6 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -23,7 +23,6 @@
#include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/memory.h"
namespace mace {
......@@ -48,7 +47,7 @@ class OpenCLBufferTransformer {
const OpenCLBufferType type,
const MemoryType out_mem_type,
const int wino_blk_size,
bool has_data_format,
DataFormat data_format,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
......@@ -67,31 +66,12 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform CPU Buffer " << input->name()
<< " to GPU Buffer " << internal_tensor->name()
<< " with data type " << dt;
if (has_data_format && input->shape().size() == 4) {
// 1. (NCHW -> NHWC)
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(input->shape(),
dst_dims);
internal_tensor->Resize(output_shape);
internal_tensor->set_data_format(DataFormat::NHWC);
// TODO(liuqi): Only support float now
const float *input_ptr = input->data<float>();
Tensor::MappingGuard guard(internal_tensor);
float *internal_ptr = internal_tensor->mutable_data<float>();
MACE_RETURN_IF_ERROR(ops::Transpose(
&context->device()->cpu_runtime()->thread_pool(),
input_ptr,
input->shape(),
dst_dims,
internal_ptr));
} else {
internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size());
}
MACE_CHECK(data_format == DataFormat::NHWC);
internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size());
// 2. convert the internal GPU Buffer to output.
return kernel_->Compute(
context, internal_tensor, type, wino_blk_size, output);
......@@ -108,30 +88,13 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform GPU Buffer " << internal_tensor.name()
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
if (has_data_format && internal_tensor.shape().size() == 4) {
// NHWC -> NCHW
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(internal_tensor.shape(),
dst_dims);
output->set_data_format(DataFormat::NCHW);
Tensor::MappingGuard guard(&internal_tensor);
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(output_shape);
float *output_ptr = output->mutable_data<float>();
return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(),
internal_ptr,
internal_tensor.shape(),
dst_dims,
output_ptr);
} else {
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS;
}
MACE_CHECK(data_format == DataFormat::NHWC);
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
return MaceStatus::MACE_SUCCESS;
......
......@@ -71,14 +71,17 @@ MaceStatus EltwiseKernel<T>::Compute(
if (input1 == nullptr) {
input1_type = "INPUT_SCALAR";
} else {
MACE_CHECK(input0->dim_size() == input1->dim_size() ||
MACE_CHECK((input0->dim_size() == input1->dim_size()
&& input0->dim_size() == 4) ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
<< "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL";
// broadcast
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
if (input0->size() != input1->size() ||
input0->dim_size() != input1->dim_size()) {
if (input0->size() < input1->size()
|| input0->dim_size() < input1->dim_size()) {
std::swap(input0, input1);
swapped = true;
}
......
......@@ -59,11 +59,6 @@ MaceStatus ReduceKernel<T>::Compute(
const Tensor *input,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
MACE_CHECK(input->dim_size() == 4,
"reduce gpu only support 4-dim input");
MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
"reduce gpu only support 1,2-axis reduce");
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......
......@@ -480,7 +480,6 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
}
}
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <memory>
#include <set>
#include <vector>
#include "mace/core/future.h"
......@@ -907,6 +908,31 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
    op_registry,
    OpConditionBuilder("Reduce")
        .SetDevicePlacerFunc(
            [](OpConditionContext *context) -> std::set<DeviceType> {
              auto op = context->operator_def();
              // GPU reduce kernel only supports keep_dims.
              bool keep_dims =
                  ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
                      *op, "keepdims", false);
              if (!keep_dims) {
                return { DeviceType::CPU };
              }
              // GPU reduce kernel only supports reducing over the H,W
              // axes, i.e. axis == {1, 2} in NHWC.
              // BUG FIX: the condition used `axis[1] == 2`, which
              // rejected the only supported pattern {1, 2} and let
              // unsupported patterns (e.g. {1, 3}) reach the GPU.
              auto axis =
                  ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
                      *op, "axis");
              if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
                return { DeviceType::CPU };
              }
              // GPU reduce kernel only supports 4-D inputs.
              auto tensor_shape_info = context->tensor_shape_info();
              if (tensor_shape_info->count(op->input(0)) == 0
                  || tensor_shape_info->at(op->input(0)).size() != 4) {
                return { DeviceType::CPU };
              }
              return { DeviceType::CPU, DeviceType::GPU };
            }));
}
} // namespace ops
......
......@@ -100,11 +100,7 @@ class ScalarMathOp : public Operation {
coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1)) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
"scalar_input_index", 1)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......
......@@ -417,7 +417,6 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
if (context->device()->gpu_runtime()->UseImageMemory()) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);
}
}
......@@ -456,7 +455,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Softmax")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
......
......@@ -144,10 +144,10 @@ void RegisterSplit(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Split")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU};
return { DeviceType::CPU };
}
int axis = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "axis", 3);
......
......@@ -77,7 +77,7 @@ void RegisterSqueeze(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Squeeze")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
......
......@@ -36,7 +36,8 @@ enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3, HTA = 4 };
enum DataFormat {
DF_NONE = 0, NHWC = 1, NCHW = 2,
HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103
HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103,
DF_AUTO = 1000,
};
enum GPUPerfHint {
......
......@@ -33,6 +33,7 @@ class DataFormat(Enum):
OIHW = 101
HWOI = 102
OHWI = 103
DF_AUTO = 1000
# SAME_LOWER: if the amount of paddings to be added is odd,
......@@ -161,13 +162,39 @@ MaceSupportedOps = [
'SumGroup',
'TargetRMSNorm',
'Transpose',
'WinogradInverseTransform',
'WinogradTransform',
'Cumsum',
]
MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str)
MaceHasDataFormatOps = [MaceOp.BatchNorm,
MaceOp.BatchToSpaceND,
MaceOp.Conv2D,
MaceOp.Deconv2D,
MaceOp.DepthToSpace,
MaceOp.DepthwiseConv2d,
MaceOp.DepthwiseDeconv2d,
MaceOp.FullyConnected,
MaceOp.Pooling,
MaceOp.ResizeBicubic,
MaceOp.ResizeBilinear,
MaceOp.ResizeNearestNeighbor,
MaceOp.SpaceToBatchND,
MaceOp.SpaceToDepth]
MaceMayHasDataFormatOps = [MaceOp.Activation,
MaceOp.AddN,
MaceOp.BiasAdd,
MaceOp.ChannelShuffle,
MaceOp.Concat,
MaceOp.Crop,
MaceOp.Eltwise,
MaceOp.Pad,
MaceOp.Reduce,
MaceOp.Softmax,
MaceOp.Split,
MaceOp.SqrDiffMean]
class MaceKeyword(object):
# node related str
......@@ -505,12 +532,11 @@ class ConverterOption(object):
TransformerRule.TRANSFORM_CHANNEL_SHUFFLE,
# Model data format related transformation
TransformerRule.TRANSPOSE_FILTERS,
TransformerRule.TRANSPOSE_DATA_FORMAT,
# Mace model structure related transformation
TransformerRule.ADD_IN_OUT_TENSOR_INFO,
TransformerRule.TRANSPOSE_MATMUL_WEIGHT,
# Add winograd argument
TransformerRule.ADD_WINOGRAD_ARG,
# Mace model structure related transformation
TransformerRule.ADD_IN_OUT_TENSOR_INFO,
# Data type related transformation
TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE,
# Transform finalization
......@@ -519,6 +545,7 @@ class ConverterOption(object):
TransformerRule.SORT_BY_EXECUTION,
# update the data format of ops
TransformerRule.UPDATE_DATA_FORMAT,
TransformerRule.TRANSPOSE_DATA_FORMAT,
# Need to be put after SORT_BY_EXECUTION
TransformerRule.ADD_QUANTIZE_TENSOR_RANGE,
]
......@@ -571,6 +598,8 @@ class ConverterUtil(object):
return DataFormat.NHWC
elif arg.i == DataFormat.NCHW.value:
return DataFormat.NCHW
elif arg.i == DataFormat.DF_AUTO.value:
return DataFormat.DF_AUTO
else:
return None
......
......@@ -195,6 +195,7 @@ class CaffeConverter(base_converter.ConverterInterface):
self._option = option
self._mace_net_def = mace_pb2.NetDef()
ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
ConverterUtil.add_data_format_arg(self._mace_net_def, DataFormat.NCHW)
self._caffe_net = CaffeNet()
self._caffe_layers = caffe_pb2.NetParameter()
caffe_weights = caffe_pb2.NetParameter()
......
......@@ -387,6 +387,7 @@ class OnnxConverter(base_converter.ConverterInterface):
self._mace_net_def = mace_pb2.NetDef()
self._data_format = DataFormat.NCHW
ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.OIHW)
ConverterUtil.add_data_format_arg(self._mace_net_def, self._data_format)
onnx_model = onnx.load(src_model_file)
ir_version = onnx_model.ir_version
......
......@@ -270,6 +270,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
self._option = option
self._mace_net_def = mace_pb2.NetDef()
ConverterUtil.set_filter_format(self._mace_net_def, DataFormat.HWIO)
ConverterUtil.add_data_format_arg(self._mace_net_def, DataFormat.NHWC)
# import tensorflow graph
tf_graph_def = tf.GraphDef()
......
......@@ -27,6 +27,8 @@ from mace.python.tools.converter_tool.base_converter import EltwiseType
from mace.python.tools.converter_tool.base_converter import FrameworkType
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceHasDataFormatOps
from mace.python.tools.converter_tool.base_converter import MaceMayHasDataFormatOps # noqa
from mace.python.tools.converter_tool.base_converter import PaddingMode
from mace.python.tools.converter_tool.base_converter import ReduceType
from mace.python.tools.converter_tool.base_converter import TransformerRule
......@@ -77,10 +79,9 @@ class Transformer(base_converter.ConverterInterface):
self.transpose_matmul_weight,
TransformerRule.FOLD_FC_RESHAPE:
self.fold_fc_reshape,
TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg,
TransformerRule.ADD_IN_OUT_TENSOR_INFO:
self.add_in_out_tensor_info,
TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg,
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC:
self.transform_global_conv_to_fc,
TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight,
......@@ -96,6 +97,7 @@ class Transformer(base_converter.ConverterInterface):
self.add_opencl_informations,
TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution,
TransformerRule.UPDATE_DATA_FORMAT: self.update_data_format,
TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
TransformerRule.CHECK_QUANTIZE_INFO:
self.check_quantize_info,
TransformerRule.TRANSPOSE_CAFFE_RESHAPE_AND_FLATTEN:
......@@ -194,21 +196,19 @@ class Transformer(base_converter.ConverterInterface):
op.type = "Input"
data_type_arg = op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_FLOAT
data_type_arg.i = input_node.data_type
op.output.extend([input_node.name])
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
if input_node.name in self._consumers:
if ConverterUtil.data_format(
self._consumers[input_node.name][0]) \
== DataFormat.NCHW:
if input_node.data_format != DataFormat.DF_NONE:
if input_node.data_format == DataFormat.NCHW:
self.transpose_shape(output_shape.dims,
[0, 3, 1, 2])
ConverterUtil.add_data_format_arg(op,
DataFormat.NCHW)
else:
ConverterUtil.add_data_format_arg(op,
DataFormat.NHWC)
ConverterUtil.add_data_format_arg(op,
DataFormat.DF_AUTO)
else:
ConverterUtil.add_data_format_arg(op,
DataFormat.DF_NONE)
self._producer[op.output[0]] = op
@staticmethod
......@@ -256,6 +256,13 @@ class Transformer(base_converter.ConverterInterface):
else:
return None
def get_tensor_data_format(self, tensor):
    """Return the data format of the op that produces ``tensor``.

    Falls back to ``DataFormat.DF_NONE`` when the tensor has no
    recorded producer.
    """
    if tensor not in self._producer:
        return DataFormat.DF_NONE
    return ConverterUtil.data_format(self._producer[tensor])
def consumer_count(self, tensor_name):
    """Return how many ops consume ``tensor_name`` (0 if none)."""
    consumers = self._consumers.get(tensor_name, [])
    return len(consumers)
......@@ -838,8 +845,6 @@ class Transformer(base_converter.ConverterInterface):
or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name)
and len(op.input) == 2)
or (op.type == MaceOp.WinogradInverseTransform.name
and len(op.input) == 1)
or (op.type == MaceOp.Deconv2D.name
and ((ConverterUtil.get_arg(
op,
......@@ -930,8 +935,7 @@ class Transformer(base_converter.ConverterInterface):
or op.type == MaceOp.Deconv2D.name
or op.type == MaceOp.DepthwiseConv2d.name
or op.type == MaceOp.FullyConnected.name
or op.type == MaceOp.BatchNorm.name
or op.type == MaceOp.WinogradInverseTransform.name) \
or op.type == MaceOp.BatchNorm.name) \
and len(self._consumers.get(op.output[0], [])) == 1:
consumer_op = self._consumers[op.output[0]][0]
if consumer_op.type == MaceOp.Activation.name \
......@@ -1017,96 +1021,6 @@ class Transformer(base_converter.ConverterInterface):
filter_format.name)
return False
def transpose_data_format(self):
net = self._model
for op in net.op:
# transpose args
if op.type == MaceOp.Pad.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_paddings_str:
mace_check(len(arg.ints) == 8,
"pad dim rank should be 8.")
if ConverterUtil.data_format(op) == DataFormat.NCHW:
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints,
[0, 1, 4, 5, 6, 7, 2, 3])
elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if (ConverterUtil.data_format(op) == DataFormat.NCHW
and len(op.output_shape[0].dims) == 4):
print("Transpose concat/split args: %s(%s)"
% (op.name, op.type))
if arg.i == 1:
arg.i = 3
elif arg.i == 2:
arg.i = 1
elif arg.i == 3:
arg.i = 2
producer = self._producer[op.input[0]]
input_shape = producer.output_shape[0].dims
if producer.type == MaceOp.FullyConnected.name and \
len(input_shape) == 2:
axis_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_axis_str)
if axis_arg.i == 1:
axis_arg.i = 3
elif op.type == MaceOp.Squeeze.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if ConverterUtil.data_format(op) == DataFormat.NCHW:
print("Transpose squeeze args: %s(%s)"
% (op.name, op.type))
mace_check(list(arg.ints) == [2, 3],
'only support squeeze at at [2, 3]')
arg.ints[:] = [1, 2]
elif op.type == MaceOp.Reduce.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if ConverterUtil.data_format(
op) == DataFormat.NCHW:
print("Transpose reduce args: %s(%s)"
% (op.name, op.type))
reduce_axises = list(arg.ints)
new_axises = []
for i in range(len(reduce_axises)):
idx = reduce_axises[i]
if idx == 2 or idx == 3:
new_axises.append(idx - 1)
elif idx == 1:
new_axises.append(3)
else:
new_axises.append(idx)
new_axises.sort()
arg.ints[:] = []
arg.ints.extend(new_axises)
elif op.type == MaceOp.Crop.name:
offset_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_offset_str)
mace_check(offset_arg and
ConverterUtil.data_format(op) == DataFormat.NCHW and
len(op.output_shape[0].dims) == 4,
"MACE only support crop with NCHW format")
print("Transpose crop args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
# transpose op output shape
data_format = ConverterUtil.data_format(op)
if data_format is not None \
and data_format != DataFormat.NHWC:
print("Transpose output shapes: %s(%s)" % (op.name, op.type))
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 2, 3, 1])
return False
def add_winograd_arg(self):
if self._wino_arg == 0:
......@@ -1428,17 +1342,121 @@ class Transformer(base_converter.ConverterInterface):
def update_data_format(self):
print("update data format")
data_format_flag = 1
for input_node in self._option.input_nodes.values():
if input_node.data_format.value == DataFormat.DF_NONE.value:
data_format_flag = 0
net = self._model
for op in net.op:
ConverterUtil.del_arg(
df_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_data_format_str)
has_data_format_arg = op.arg.add()
has_data_format_arg.name = MaceKeyword.mace_has_data_format_str
has_data_format_arg.i = data_format_flag
if not df_arg:
df_arg = op.arg.add()
df_arg.name = MaceKeyword.mace_data_format_str
if op.type in MaceHasDataFormatOps:
df_arg.i = DataFormat.DF_AUTO.value
elif op.type in MaceMayHasDataFormatOps:
input_df = DataFormat.DF_AUTO.value
for input_tensor in op.input:
if input_tensor in self._consts:
continue
mace_check(input_tensor in self._producer,
"Input tensor %s not in producer" % input_tensor)
father_op = self._producer[input_tensor]
temp_input_df = ConverterUtil.get_arg(
father_op, MaceKeyword.mace_data_format_str)
if temp_input_df.i != DataFormat.DF_AUTO.value:
input_df = temp_input_df.i
if input_df == DataFormat.DF_AUTO.value:
df_arg.i = input_df
# add flag to mark the ops may has data format
has_data_format_arg = op.arg.add()
has_data_format_arg.name = \
MaceKeyword.mace_has_data_format_str
has_data_format_arg.i = 1
return False
def transpose_data_format(self):
print("Transpose arguments based on data format")
net = self._model
src_data_format = ConverterUtil.data_format(net)
for op in net.op:
has_data_format = ConverterUtil.data_format(op) == \
DataFormat.DF_AUTO
# transpose args
if op.type == MaceOp.Pad.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_paddings_str:
mace_check(len(arg.ints) == 8,
"pad dim rank should be 8.")
if src_data_format == DataFormat.NCHW and \
has_data_format:
print("Transpose pad args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(arg.ints,
[0, 1, 4, 5, 6, 7, 2, 3])
elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if (src_data_format == DataFormat.NCHW
and has_data_format
and len(op.output_shape[0].dims) == 4):
print("Transpose concat/split args: %s(%s)"
% (op.name, op.type))
if arg.i == 1:
arg.i = 3
elif arg.i == 2:
arg.i = 1
elif arg.i == 3:
arg.i = 2
producer = self._producer[op.input[0]]
input_shape = producer.output_shape[0].dims
if producer.type == MaceOp.FullyConnected.name and \
len(input_shape) == 2:
axis_arg = ConverterUtil.get_arg(
op, MaceKeyword.mace_axis_str)
if axis_arg.i == 1:
axis_arg.i = 3
elif op.type == MaceOp.Reduce.name:
for arg in op.arg:
if arg.name == MaceKeyword.mace_axis_str:
if src_data_format == DataFormat.NCHW and \
has_data_format:
print("Transpose reduce args: %s(%s)"
% (op.name, op.type))
reduce_axises = list(arg.ints)
new_axises = []
for i in range(len(reduce_axises)):
idx = reduce_axises[i]
if idx == 2 or idx == 3:
new_axises.append(idx - 1)
elif idx == 1:
new_axises.append(3)
else:
new_axises.append(idx)
new_axises.sort()
arg.ints[:] = []
arg.ints.extend(new_axises)
elif op.type == MaceOp.Crop.name:
offset_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_offset_str)
mace_check(offset_arg and
src_data_format == DataFormat.NCHW
and has_data_format
and len(op.output_shape[0].dims) == 4,
"MACE only support crop with NCHW format")
print("Transpose crop args: %s(%s)"
% (op.name, op.type))
self.transpose_shape(offset_arg.ints, [0, 2, 3, 1])
# transpose op output shape
if src_data_format == DataFormat.NCHW and \
has_data_format:
print("Transpose output shapes: %s(%s)" % (op.name, op.type))
for output_shape in op.output_shape:
if len(output_shape.dims) == 4:
self.transpose_shape(output_shape.dims,
[0, 2, 3, 1])
return False
def quantize_nodes(self):
......@@ -1493,7 +1511,7 @@ class Transformer(base_converter.ConverterInterface):
self._model.input_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC)
ConverterUtil.add_data_format_arg(op_def, input_node.data_format)
# use actual ranges for model input quantize
find_range_every_time_arg = op_def.arg.add()
find_range_every_time_arg.name = \
......@@ -1516,6 +1534,7 @@ class Transformer(base_converter.ConverterInterface):
self._model.output_info[i].zero_point = quantize_info.zero_point
ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8)
ConverterUtil.add_data_format_arg(op_def, output_node.data_format)
quantize_flag_arg = self._model.arg.add()
quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str
......@@ -1886,9 +1905,6 @@ class Transformer(base_converter.ConverterInterface):
shape_tensor.data_type = mace_pb2.DT_INT32
else:
mace_check(False, "Only support reshape and flatten")
# NCHW -> NHWC
if len(dims) == 4:
self.transpose_shape(dims, [0, 2, 3, 1])
shape_tensor.int32_data.extend(dims)
op.input.append(shape_tensor.name)
......@@ -2030,6 +2046,9 @@ class Transformer(base_converter.ConverterInterface):
data_type_arg = quantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = mace_pb2.DT_UINT8
ConverterUtil.add_data_format_arg(
quantize_op,
self.get_tensor_data_format(input_tensor))
data_type_arg = quantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_non_zero
......@@ -2050,8 +2069,8 @@ class Transformer(base_converter.ConverterInterface):
del op.input[:]
op.input.extend(quantized_inputs_names)
orginal_output_name = op.output[0]
op.output[0] = orginal_output_name + "_quant"
original_output_name = op.output[0]
op.output[0] = original_output_name + "_quant"
op.output_type.extend([to_quantize_ops_output_type[op.type]])
data_type_arg = ConverterUtil.get_arg(op,
MaceKeyword.mace_op_data_type_str) # noqa
......@@ -2064,13 +2083,15 @@ class Transformer(base_converter.ConverterInterface):
dequantize_op.name = op.name + "_dequant"
dequantize_op.type = MaceOp.Dequantize.name
dequantize_op.input.extend([op.output[0]])
dequantize_op.output.extend([orginal_output_name])
dequantize_op.output.extend([original_output_name])
dequantize_op.output_shape.extend(op.output_shape)
dequantize_op.output_type.extend([mace_pb2.DT_FLOAT])
data_type_arg = dequantize_op.arg.add()
data_type_arg.name = MaceKeyword.mace_op_data_type_str
data_type_arg.i = to_quantize_ops_output_type[op.type]
ConverterUtil.add_data_format_arg(
dequantize_op,
self.get_tensor_data_format(original_output_name))
quantize_flag_arg = ConverterUtil.get_arg(self._model,
MaceKeyword.mace_quantize_flag_arg_str) # noqa
if quantize_flag_arg is None:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册