提交 50cf1737 编写于 作者: 李寅

Merge branch 'refactor-data-format' into 'master'

Refactor data format

See merge request !1069
......@@ -83,7 +83,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
} else if (data_format_str == "OIHW") {
return DataFormat::OIHW;
} else {
return DataFormat::DF_NONE;
return DataFormat::NONE;
}
}
......
......@@ -96,6 +96,43 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
// MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname) expands to a template
// specialization of SetProtoArg<T> for the proto message type `Def`
// (OperatorDef or NetDef).  The generated function updates the argument
// named `arg_name` in place when one already exists, otherwise appends a
// new argument; `value` is stored through the proto setter
// `set_<fieldname>` (fieldname `f` for float, `i` for bool/int/int64_t).
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, T, fieldname)    \
  template<>                                                  \
  void SetProtoArg<T>(Def *def,                               \
                      const std::string &arg_name,            \
                      const T &value) {                       \
    int size = def->arg_size();                               \
    for (int i = 0; i < size; ++i) {                          \
      auto arg = def->mutable_arg(i);                         \
      if (arg->name() == arg_name) {                          \
        VLOG(3) << "Update old argument value from "          \
                << arg->fieldname() << " to "                 \
                << value << " for " << arg_name;              \
        arg->set_##fieldname(value);                          \
        return;                                               \
      }                                                       \
    }                                                         \
    VLOG(3) << "Add new argument " << arg_name << "(name: "   \
            << arg_name << ", value: " << value << ")";       \
    auto arg = def->add_arg();                                \
    arg->set_name(arg_name);                                  \
    arg->set_##fieldname(value);                              \
  }

// Instantiates SetProtoArg for every supported argument type on `Def`.
// Note: bool and the integer types all share the proto `i` field.
#define MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(Def)     \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, float, f)       \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, bool, i)        \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int, i)         \
  MACE_SET_OPTIONAL_ARGUMENT_FUNC(Def, int64_t, i)

MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(OperatorDef)
MACE_SET_OPTIONAL_ARGUMENT_FUNC_MACRO(NetDef)
#undef MACE_SET_OPTIONAL_ARGUMENT_FUNC
// Returns the canonical argument name used to tag an op's output memory type.
const std::string OutputMemoryTypeTagName() {
  return std::string("output_mem_type");
}
bool IsQuantizedModel(const NetDef &net_def) {
return
......
......@@ -55,6 +55,18 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_;
};
template <typename T>
void SetProtoArg(OperatorDef *op_def,
const std::string &arg_name,
const T&value);
template <typename T>
void SetProtoArg(NetDef *op_def,
const std::string &arg_name,
const T&value);
const std::string OutputMemoryTypeTagName();
bool IsQuantizedModel(const NetDef &def);
} // namespace mace
......
......@@ -33,7 +33,7 @@ namespace mace {
// Returns true for ops that can reuse (alias) their input's memory block:
// these ops only change tensor metadata, not the underlying buffer.
// Note: the stripped-diff residue had left both the old and the new
// initializer line in place; adjacent string literals would have silently
// concatenated into a bogus "SqueezeReshape" entry.  Only the intended
// list (including "ExpandDims") is kept.
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOp = {
      "Reshape", "Identity", "Squeeze", "ExpandDims"
  };
  return kReuseOp.count(op_type) == 1;
}
......@@ -124,8 +124,10 @@ void MemoryOptimizer::Optimize(
op_def->output_type_size());
DataType dt;
bool has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "has_data_format", 0) != 0;
DataFormat data_format = static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "data_format",
static_cast<int>(DataFormat::NONE)));
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) {
......@@ -209,7 +211,7 @@ void MemoryOptimizer::Optimize(
mem_ref_count_[best_mem_id] = 1;
}
tensor_mem_map_.emplace(op_def->output(i), TensorMemInfo(best_mem_id,
dt, has_data_format));
dt, data_format));
}
}
......
......@@ -22,6 +22,7 @@
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/types.h"
namespace mace {
......@@ -81,10 +82,10 @@ class MemoryOptimizer {
struct TensorMemInfo {
int mem_id;
DataType data_type;
bool has_data_format;
DataFormat data_format;
TensorMemInfo(int mem_id, DataType data_type, bool has_data_format) :
mem_id(mem_id), data_type(data_type), has_data_format(has_data_format)
TensorMemInfo(int mem_id, DataType data_type, DataFormat data_format) :
mem_id(mem_id), data_type(data_type), data_format(data_format)
{}
};
......
......@@ -31,99 +31,8 @@
#include "mace/utils/memory.h"
#include "mace/utils/timer.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace {
// Tracks, for each tensor produced while building the net, where it lives
// (memory type), what it looks like (dtype / data format / shape), and
// which operation produced it, so later consumers can decide whether a
// transform op must be inserted first.
struct InternalOutputInfo {
  InternalOutputInfo(const MemoryType mem_type,
                     const DataType dtype,
                     const DataFormat data_format,
                     const std::vector<index_t> &shape,
                     int op_idx)
      : mem_type(mem_type), dtype(dtype), data_format(data_format),
        shape(shape), op_idx(op_idx) {}

  MemoryType mem_type;  // transformed memory type
  DataType dtype;
  DataFormat data_format;
  std::vector<index_t> shape;  // tensor shape
  int op_idx;  // operation which generate the tensor
};
#ifdef MACE_ENABLE_OPENCL
// Derives a unique name for the transformed copy of `input_name` that
// lives in memory of type `mem_type`.
std::string TransformedName(const std::string &input_name,
                            const mace::MemoryType mem_type) {
  std::stringstream name_builder;
  name_builder << input_name << "_mem_type_" << mem_type;
  return name_builder.str();
}
// Returns true when the op's inputs must go through a memory-type
// transform; shape-computing ops are exempt because they consume tensor
// metadata rather than tensor data.
bool TransformRequiredOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kNoTransformOp = {
      "Shape", "InferConv2dShape"
  };
  return kNoTransformOp.find(op_type) == kNoTransformOp.end();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
// Builds a single Operation from |op_def|:
//  - picks the device: the target device if the op's registration lists it,
//    otherwise falls back to CPU;
//  - configures |construct_context| (device, op def, output memory type);
//  - when the op will run on CPU, rewrites 4-D output shapes NHWC -> NCHW.
std::unique_ptr<Operation> SerialNet::CreateOperation(
    const OpRegistryBase *op_registry,
    OpConstructContext *construct_context,
    std::shared_ptr<OperatorDef> op_def,
    bool has_data_format,
    bool is_quantize_model) {
  // Create the Operation
  DeviceType target_device_type = target_device_->device_type();
  // Default to CPU until the registry confirms the target device is usable.
  DeviceType device_type = DeviceType::CPU;
  construct_context->set_device(cpu_device_.get());
  construct_context->set_operator_def(op_def);
  construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
  // Get available devices
  auto available_devices =
      op_registry->AvailableDevices(op_def->type(), construct_context);
  // Find the device type to run the op.
  // If the target_device_type in available devices, use target_device_type,
  // otherwise, fallback to CPU device.
  for (auto device : available_devices) {
    if (device == target_device_type) {
      device_type = target_device_type;
      construct_context->set_device(target_device_);
      if (target_device_->device_type() == DeviceType::GPU) {
        construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
      }
      break;
    }
  }
  op_def->set_device_type(device_type);
  // transpose output shape if run on CPU (default format is NHWC)
  // Quantized models keep their layout, hence the is_quantize_model guard.
  if (!is_quantize_model && device_type == DeviceType::CPU &&
      op_def->output_shape_size() == op_def->output_size()) {
    for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
      // Only formatted 4-D outputs are transposed.
      if (has_data_format && op_def->output_shape(out_idx).dims_size() == 4) {
        // NHWC -> NCHW
        std::vector<index_t> output_shape =
            TransposeShape<index_t, index_t>(
                std::vector<index_t>(
                    op_def->output_shape(out_idx).dims().begin(),
                    op_def->output_shape(out_idx).dims().end()),
                {0, 3, 1, 2});
        for (int i = 0; i < 4; ++i) {
          op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
        }
      }
    }
  }
  return op_registry->CreateOperation(construct_context, device_type);
}
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
......@@ -138,237 +47,47 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(),
&target_device->cpu_runtime()->thread_pool())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
// Tensor Shape map
std::unordered_map<std::string, std::vector<index_t>> tensor_shape_map;
for (auto &op : net_def->op()) {
if (op.output_size() != op.output_shape_size()) {
continue;
}
for (int i = 0; i < op.output_size(); ++i) {
tensor_shape_map[op.output(i)] = std::vector<index_t>(
op.output_shape(i).dims().begin(),
op.output_shape(i).dims().end());
}
}
for (auto &tensor : net_def->tensors()) {
tensor_shape_map[tensor.name()] =
std::vector<index_t>(tensor.dims().begin(), tensor.dims().end());
}
bool has_data_format = false;
if (target_device_->device_type() == DeviceType::CPU) {
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
// Only could be NONE or NHWC
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = has_data_format ||
(input_data_format != DataFormat::DF_NONE);
if (!is_quantize_model && input_data_format == DataFormat::NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
}
}
}
#ifdef MACE_ENABLE_OPENCL
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_set<std::string> transformed_set;
// add input information
MemoryType target_mem_type;
// default data format of output tensor
DataFormat default_output_df = DataFormat::DF_NONE;
if (target_device_->device_type() == DeviceType::GPU) {
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
DataFormat input_data_format = static_cast<DataFormat>(
input_info.data_format());
has_data_format = input_data_format != DataFormat::DF_NONE;
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// update tensor shape map
tensor_shape_map[input_info.name()] = input_shape;
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_data_format,
input_shape, -1));
}
default_output_df =
has_data_format ? DataFormat::NHWC : DataFormat::DF_NONE;
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_, &tensor_shape_map);
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
has_data_format,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
// the outputs' memory type of the operation
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
// if op is memory-unused op, no transformation
if (TransformRequiredOp(op_def->type())) {
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether to do transform
MemoryType wanted_in_mem_type =
construct_context.GetInputMemType(i);
DataType wanted_in_dt = construct_context.GetInputDataType(i);
if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
|| output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
auto t_input_name = TransformedName(op_def->input(i),
wanted_in_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation " << op_def->name()
<< " to transform tensor "
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< wanted_in_dt << ". with data format "
<< output_info.data_format;
std::string input_name = op_def->input(i);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
output_info.data_format == DataFormat::NCHW &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name, wanted_in_dt,
construct_context.GetInputOpenCLBufferType(i),
wanted_in_mem_type, has_data_format);
OpConstructContext t_construct_context(ws_);
auto transform_op = CreateOperation(
op_registry,
&t_construct_context,
transform_op_def,
has_data_format);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, t_input_name);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
}
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
DataType dt;
if (op_def->output_type_size() == op_def->output_size()) {
dt = op_def->output_type(out_idx);
} else {
dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
}
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
default_output_df,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
auto op_device_type = static_cast<DeviceType>(op_def->device_type());
if (op_device_type == target_device_->device_type()) {
construct_context.set_device(target_device_);
} else if (op_device_type == DeviceType::CPU) {
construct_context.set_device(cpu_device_.get());
} else {
LOG(FATAL) << "Encounter unexpected error: "
<< op_device_type << " vs " << target_device_->device_type();
}
#endif // MACE_ENABLE_OPENCL
construct_context.set_operator_def(op_def);
auto op = op_registry->CreateOperation(&construct_context,
op_device_type);
operators_.emplace_back(std::move(op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != output_info.data_type()) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << output_info.data_type();
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
bool output_has_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
output_info.data_type(),
OpenCLBufferType::IN_OUT_CHANNEL,
target_mem_type,
output_has_data_format);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_has_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
if (target_device_->device_type() == DeviceType::GPU) {
// update the map : output_tensor -> MemoryType
MemoryType out_mem_type =
static_cast<MemoryType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
net_def->op(idx), OutputMemoryTypeTagName(),
static_cast<int>(MemoryType::CPU_BUFFER)));
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
}
}
}
#endif // MACE_ENABLE_OPENCL
}
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
......
......@@ -54,14 +54,6 @@ class SerialNet : public NetBase {
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
bool has_data_format,
bool is_quantize_model = false);
protected:
Workspace *ws_;
Device *target_device_;
......
此差异已折叠。
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_DEF_ADAPTER_H_
#define MACE_CORE_NET_DEF_ADAPTER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "mace/core/types.h"
#include "mace/proto/mace.pb.h"
#include "mace/port/port.h"
#include "mace/core/operator.h"
#include "mace/core/net_optimizer.h"
namespace mace {
class OpRegistryBase;
class Workspace;
class Device;
/// Conventions:
/// 1. DataFormat::AUTO stands for formatted (NHWC or NCHW)
/// 2. if Op with DataFormat::AUTO, the arguments of this op
///    is formatted to NHWC
class NetDefAdapter {
 public:
  NetDefAdapter(const OpRegistryBase *op_registry,
                const Workspace *ws);

  // Adapt original net_def to a better net.
  // 1. Adapt device: choose best device for every op in the net.
  // 2. Adapt data type: Add data type related transform ops
  //    for mixing precision.
  // 3. Adapt data format: confirm data format of every op
  //    and add transpose if necessary.
  // 4. Adapt memory type: Add BufferTransform if necessary
  //    for transforming memory type between ops.
  MaceStatus AdaptNetDef(
      const NetDef *net_def,
      Device *target_device,
      NetDef *target_net_def);

 public:
  // Non-copyable and non-movable.
  NetDefAdapter(const NetDefAdapter&) = delete;
  NetDefAdapter(const NetDefAdapter&&) = delete;
  NetDefAdapter &operator=(const NetDefAdapter &) = delete;
  NetDefAdapter &operator=(const NetDefAdapter &&) = delete;

 private:
  // Where/how a tensor produced by an earlier op currently lives, used to
  // decide whether transform ops must be inserted before a consumer.
  struct InternalOutputInfo {
    InternalOutputInfo(const MemoryType mem_type,
                       const DataType dtype,
                       const DataFormat data_format,
                       const std::vector<index_t> &shape,
                       int op_idx)
        : mem_type(mem_type), dtype(dtype), data_format(data_format),
          shape(shape), op_idx(op_idx) {}

    MemoryType mem_type;
    DataType dtype;
    DataFormat data_format;
    std::vector<index_t> shape;  // tensor shape
    int op_idx;  // operation which generate the tensor
  };

  // Maps tensor name -> bookkeeping info for tensors produced so far.
  typedef std::unordered_map<std::string, InternalOutputInfo> TensorInfoMap;

 private:
  // Selects the device the op should run on (step 1 of AdaptNetDef).
  MaceStatus AdaptDevice(OpConditionContext *context,
                         Device *target_device,
                         Device *cpu_device,
                         const TensorInfoMap &output_map,
                         const NetDef *net_def,
                         OperatorDef *op);
  // Handles data-type adaptation for mixed precision (step 2).
  MaceStatus AdaptDataType(OpConditionContext *context,
                           OperatorDef *op);
  // Confirms each input's data format, adding transposes as needed (step 3).
  MaceStatus AdaptDataFormat(
      OpConditionContext *context,
      OperatorDef *op,
      bool is_quantized_model,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      DataFormat *op_output_df,
      NetDef *target_net_def);
  // Inserts BufferTransform ops when memory types disagree (step 4).
  MaceStatus AdaptMemoryType(
      OpConditionContext *context,
      OperatorDef *op_def,
      TensorInfoMap *output_map,
      std::unordered_set<std::string> *transformed_set,
      MemoryType *op_output_mem_types,
      NetDef *target_net_def);
  // Human-readable dump of |net_def| for logging/debugging.
  std::string DebugString(const NetDef *net_def);

 private:
  const OpRegistryBase *op_registry_;  // not owned
  const Workspace *ws_;                // not owned
  NetOptimizer net_optimizer_;         // device-selection helper
};
} // namespace mace
#endif // MACE_CORE_NET_DEF_ADAPTER_H_
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/net_optimizer.h"
#include <string>
namespace mace {
// Chooses the device an op should run on.  Falls back to CPU unless the op
// supports the target device; keeps compute-intensive ops on the target
// device; otherwise greedily follows the devices of the producing ops so
// chains of lightweight ops stay on one device.
DeviceType NetOptimizer::SelectBestDevice(
    const OperatorDef *op_def,
    DeviceType target_device_type,
    const std::set<DeviceType> &available_devices,
    const std::vector<DeviceType> &inputs_op_devices) {
  // Ops heavy enough to always justify running on the target device.
  static const std::set<std::string> kComputeIntensiveOps = {
      "Conv2D", "DepthwiseConv2d", "Deconv2D", "DepthwiseDeconv2d",
      "FullyConnected"
  };
  // CPU is the fallback when the target device is unavailable for this op.
  const bool target_supported =
      available_devices.count(target_device_type) == 1;
  DeviceType chosen =
      target_supported ? target_device_type : DeviceType::CPU;
  if (chosen == DeviceType::CPU) {
    return chosen;
  }
  // Compute-intensive ops stay on the target device.
  if (kComputeIntensiveOps.count(op_def->type()) == 1) {
    return chosen;
  }
  // Greedy strategy: track the devices of the input (producer) ops.
  for (const auto input_device : inputs_op_devices) {
    if (input_device != chosen) {
      chosen = input_device;
    }
  }
  return chosen;
}
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_NET_OPTIMIZER_H_
#define MACE_CORE_NET_OPTIMIZER_H_
#include <set>
#include <vector>
#include "mace/port/port.h"
#include "mace/proto/mace.pb.h"
namespace mace {
/// Any optimization for Net could be put in here in the future.
/// Stateless: holds no data members, so an instance is cheap to share.
class NetOptimizer {
 public:
  /// Select best device for the op to support mixing usage of CPU and GPU.
  /// Greedy strategy: one way to the end. If the op falls back to CPU, then
  /// the follow-up ops will run on CPU too until they meet
  /// some compute-intensive ops (Convolution) to
  /// reduce the memory copy between CPU and GPU.
  /// Simple but effective.
  ///
  /// \param op_def the op
  /// \param target_device target device to run on
  /// \param available_devices available devices of the op
  /// \param inputs_op_devices devices the parent (input) ops run on
  /// \return Best device for the op_def
  DeviceType SelectBestDevice(const OperatorDef *op_def,
                              DeviceType target_device,
                              const std::set<DeviceType> &available_devices,
                              const std::vector<DeviceType> &inputs_op_devices);
};
} // namespace mace
#endif // MACE_CORE_NET_OPTIMIZER_H_
......@@ -20,36 +20,23 @@
#include "mace/core/operator.h"
namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(nullptr) {}
OpConstructContext::OpConstructContext(
mace::Workspace *ws,
mace::OpConstructContext::TensorShapeMap *info)
OpConditionContext::OpConditionContext(
const Workspace *ws,
OpConditionContext::TensorShapeMap *info)
: operator_def_(nullptr),
ws_(ws),
device_(nullptr),
tensor_shape_info_(info) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) {
void OpConditionContext::set_operator_def(
const OperatorDef *operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConstructContext::set_output_mem_type(mace::MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
void OpConstructContext::SetInputInfo(size_t idx,
mace::MemoryType mem_type,
mace::DataType dt) {
void OpConditionContext::SetInputInfo(size_t idx,
MemoryType mem_type,
DataType dt) {
if (input_mem_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
......@@ -66,7 +53,13 @@ void OpConstructContext::SetInputInfo(size_t idx,
input_data_types_[idx] = dt;
}
MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
// Sets the expected output memory type and resets any per-input overrides:
// inputs default to the output memory type until SetInputInfo is called.
// Requires set_operator_def to have been called first.
void OpConditionContext::set_output_mem_type(MemoryType type) {
  MACE_CHECK(operator_def_ != nullptr);
  output_mem_type_ = type;
  input_mem_types_.clear();
}
MemoryType OpConditionContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
......@@ -75,7 +68,7 @@ MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
return input_mem_types_[idx];
}
DataType OpConstructContext::GetInputDataType(size_t idx) const {
DataType OpConditionContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// the default inputs' data types are same as operation's data type.
return static_cast<DataType>(
......@@ -87,17 +80,17 @@ DataType OpConstructContext::GetInputDataType(size_t idx) const {
}
#ifdef MACE_ENABLE_OPENCL
void OpConstructContext::SetInputOpenCLBufferType(
void OpConditionContext::SetInputOpenCLBufferType(
size_t idx, OpenCLBufferType buffer_type) {
if (input_opencl_buffer_types_.empty()) {
// the default inputs' memory types are same as output memory type.
input_opencl_buffer_types_.resize(operator_def_->input_size(),
OpenCLBufferType::IN_OUT_CHANNEL);
OpenCLBufferType::IN_OUT_CHANNEL);
}
MACE_CHECK(idx < input_opencl_buffer_types_.size());
input_opencl_buffer_types_[idx] = buffer_type;
}
OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
OpenCLBufferType OpConditionContext::GetInputOpenCLBufferType(
size_t idx) const {
if (input_opencl_buffer_types_.empty()) {
return OpenCLBufferType::IN_OUT_CHANNEL;
......@@ -107,6 +100,16 @@ OpenCLBufferType OpConstructContext::GetInputOpenCLBufferType(
}
#endif // MACE_ENABLE_OPENCL
// Context used while constructing operations; the caller sets the device
// and operator def per-op before CreateOperation.
OpConstructContext::OpConstructContext(Workspace *ws)
    : operator_def_(nullptr),
      ws_(ws),
      device_(nullptr) {}

// Takes shared ownership of the op definition being constructed.
void OpConstructContext::set_operator_def(
    std::shared_ptr<OperatorDef> operator_def) {
  operator_def_ = operator_def;
}

// Context passed to Operation::Init: just the workspace and target device.
OpInitContext::OpInitContext(Workspace *ws, Device *device)
    : ws_(ws), device_(device) {}
......@@ -202,19 +205,40 @@ const std::string OpKeyBuilder::Build() {
} // namespace
// Installs the default op-condition callbacks.  The stripped-diff residue
// had left the superseded OpConstructContext-based device_placer lambda
// (with its 4-D output check) in front of the new default one; only the
// intended defaults remain.
OpRegistrationInfo::OpRegistrationInfo() {
  // default device type placer: every registered device is eligible.
  device_placer = [this](OpConditionContext *context) -> std::set<DeviceType> {
    MACE_UNUSED(context);
    return this->devices;
  };

  // default input and output memory type setter
  memory_type_setter = [](OpConditionContext *context) -> void {
    if (context->device()->device_type() == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
      if (context->device()->gpu_runtime()->UseImageMemory()) {
        context->set_output_mem_type(MemoryType::GPU_IMAGE);
      } else {
        context->set_output_mem_type(MemoryType::GPU_BUFFER);
      }
#endif  // MACE_ENABLE_OPENCL
    } else {
      context->set_output_mem_type(MemoryType::CPU_BUFFER);
    }
  };

  // default data format selector: every input expects the op's own
  // "data_format" argument (NONE when absent).
  data_format_selector = [](OpConditionContext *context)
      -> std::vector<DataFormat> {
    DataFormat op_data_format =
        static_cast<DataFormat>(
            ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                *context->operator_def(), "data_format",
                static_cast<int>(DataFormat::NONE)));
    return std::vector<DataFormat>(context->operator_def()->input_size(),
                                   op_data_format);
  };
}
void OpRegistrationInfo::AddDevice(mace::DeviceType device) {
void OpRegistrationInfo::AddDevice(DeviceType device) {
devices.insert(device);
}
......@@ -226,9 +250,9 @@ void OpRegistrationInfo::Register(const std::string &key, OpCreator creator) {
MaceStatus OpRegistryBase::Register(
const std::string &op_type,
const mace::DeviceType device_type,
const mace::DataType dt,
mace::OpRegistrationInfo::OpCreator creator) {
const DeviceType device_type,
const DataType dt,
OpRegistrationInfo::OpCreator creator) {
if (registry_.count(op_type) == 0) {
registry_[op_type] = std::unique_ptr<OpRegistrationInfo>(
new OpRegistrationInfo);
......@@ -255,13 +279,29 @@ MaceStatus OpRegistryBase::Register(
}
const std::set<DeviceType> OpRegistryBase::AvailableDevices(
const std::string &op_type, OpConstructContext *context) const {
const std::string &op_type, OpConditionContext *context) const {
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
return registry_.at(op_type)->device_placer(context);
}
// Fills |context| with the op's expected input/output memory types via the
// registered memory_type_setter.  |op_type| must already be registered.
void OpRegistryBase::GetInOutMemoryTypes(
    const std::string &op_type,
    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
  return registry_.at(op_type)->memory_type_setter(context);
}

// Returns the data format expected for each input of |op_type|, as decided
// by the registered data_format_selector.
const std::vector<DataFormat> OpRegistryBase::InputsDataFormat(
    const std::string &op_type,
    OpConditionContext *context) const {
  MACE_CHECK(registry_.count(op_type) != 0,
             op_type, " operation is not registered.");
  return registry_.at(op_type)->data_format_selector(context);
}
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type) const {
......@@ -269,15 +309,6 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
......@@ -308,9 +339,30 @@ OpConditionBuilder &OpConditionBuilder::SetDevicePlacerFunc(
return *this;
}
// Overrides the default memory-type setter for the ops being registered.
// Returns *this to allow chained builder calls.
OpConditionBuilder& OpConditionBuilder::SetInputMemoryTypeSetter(
    OpRegistrationInfo::MemoryTypeSetter setter) {
  memory_type_setter_ = setter;
  return *this;
}

// Overrides the default inputs data-format selector for the ops being
// registered.  Returns *this to allow chained builder calls.
OpConditionBuilder& OpConditionBuilder::SetInputsDataFormatSelector(
    OpRegistrationInfo::DataFormatSelector selector) {
  data_format_selector_ = selector;
  return *this;
}
// Copies every callback that was explicitly configured on this builder
// into |info|, leaving the registration's defaults in place otherwise.
// (The stripped-diff residue had left the old
// `if (info != nullptr && placer_)` lines interleaved with the new
// version, breaking the brace structure; only the intended body remains.)
void OpConditionBuilder::Finalize(OpRegistrationInfo *info) const {
  if (info != nullptr) {
    if (placer_) {
      info->device_placer = placer_;
    }
    if (memory_type_setter_) {
      info->memory_type_setter = memory_type_setter_;
    }
    if (data_format_selector_) {
      info->data_format_selector = data_format_selector_;
    }
  }
}
......
......@@ -32,22 +32,20 @@
namespace mace {
// memory_optimizer, device
class OpConstructContext {
// OpConditionContext has all information used for choosing proper Op
class OpConditionContext {
public:
typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
OpConditionContext(const Workspace *ws, TensorShapeMap *info);
~OpConditionContext() = default;
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(Workspace *ws, TensorShapeMap *info);
~OpConstructContext() = default;
void set_operator_def(const OperatorDef* operator_def);
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
inline const OperatorDef *operator_def() const {
return operator_def_;
}
inline Workspace *workspace() const {
inline const Workspace *workspace() const {
return ws_;
}
......@@ -81,8 +79,8 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
const OperatorDef *operator_def_;
const Workspace *ws_;
Device *device_;
TensorShapeMap *tensor_shape_info_;
// used for memory transform
......@@ -94,6 +92,46 @@ class OpConstructContext {
#endif // MACE_ENABLE_OPENCL
};
// memory_optimizer, device
// Context handed to an Operation's constructor. Carries the operator
// definition, the workspace that owns tensors, and the target device.
// It does not own the workspace or the device.
class OpConstructContext {
  // Maps tensor name -> shape.
  // NOTE(review): not referenced by any member visible here — presumably kept
  // for parity with OpConditionContext; confirm before removing.
  typedef std::unordered_map<std::string, std::vector<index_t>> TensorShapeMap;
 public:
  // |ws| is borrowed, not owned; it must outlive this context.
  explicit OpConstructContext(Workspace *ws);
  ~OpConstructContext() = default;
  // Replaces the operator definition this context describes
  // (ownership shared with the caller; defined out of line).
  void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
  inline std::shared_ptr<OperatorDef> operator_def() const {
    return operator_def_;
  }
  inline Workspace *workspace() const {
    return ws_;
  }
  // |device| is borrowed, not owned.
  inline void set_device(Device* device) {
    device_ = device;
  }
  inline Device *device() const {
    return device_;
  }
#ifdef MACE_ENABLE_OPENCL
  // Reads the op's "output_mem_type" argument (OutputMemoryTypeTagName());
  // falls back to MemoryType::CPU_BUFFER when the argument is absent.
  inline MemoryType GetOpMemoryType() const {
    return static_cast<MemoryType>(
        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
            *operator_def_, OutputMemoryTypeTagName(),
            static_cast<int>(MemoryType::CPU_BUFFER)));
  }
#endif  // MACE_ENABLE_OPENCL
 private:
  std::shared_ptr<OperatorDef> operator_def_;
  Workspace *ws_;   // not owned
  Device *device_;  // not owned
};
// memory_optimizer, device
class OpInitContext {
public:
......@@ -207,8 +245,11 @@ struct OpRegistrationInfo {
public:
typedef std::function<std::unique_ptr<Operation>(OpConstructContext *)>
OpCreator;
typedef std::function<std::set<DeviceType>(OpConstructContext *)>
typedef std::function<std::set<DeviceType>(OpConditionContext *)>
DevicePlacer;
typedef std::function<void(OpConditionContext *)> MemoryTypeSetter;
typedef std::function<std::vector<DataFormat>(OpConditionContext *)>
DataFormatSelector;
OpRegistrationInfo();
......@@ -219,6 +260,8 @@ struct OpRegistrationInfo {
std::set<DeviceType> devices;
std::unordered_map<std::string, OpCreator> creators;
DevicePlacer device_placer;
MemoryTypeSetter memory_type_setter;
DataFormatSelector data_format_selector;
};
class OpConditionBuilder {
......@@ -230,11 +273,21 @@ class OpConditionBuilder {
OpConditionBuilder &SetDevicePlacerFunc(
OpRegistrationInfo::DevicePlacer placer);
// If you set input memory type for specified Op,
// you must call OpConditionContext::set_output_mem_type
OpConditionBuilder &SetInputMemoryTypeSetter(
OpRegistrationInfo::MemoryTypeSetter setter);
OpConditionBuilder &SetInputsDataFormatSelector(
OpRegistrationInfo::DataFormatSelector selector);
void Finalize(OpRegistrationInfo *info) const;
private:
std::string type_;
OpRegistrationInfo::DevicePlacer placer_;
OpRegistrationInfo::MemoryTypeSetter memory_type_setter_;
OpRegistrationInfo::DataFormatSelector data_format_selector_;
};
......@@ -250,7 +303,13 @@ class OpRegistryBase {
MaceStatus Register(const OpConditionBuilder &builder);
const std::set<DeviceType> AvailableDevices(
const std::string &op_type, OpConstructContext *context) const;
const std::string &op_type, OpConditionContext *context) const;
void GetInOutMemoryTypes(
const std::string &op_type, OpConditionContext *context) const;
const std::vector<DataFormat> InputsDataFormat(
const std::string &op_type, OpConditionContext *context) const;
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
......
......@@ -147,38 +147,38 @@ void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
}
}
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
void OpenCLUtil::BuildTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const OpenCLBufferType buffer_type,
const mace::MemoryType mem_type,
bool has_data_format) {
std::unique_ptr<OperatorDef> op(new OperatorDef);
DataFormat data_format,
OperatorDef *op_def) {
std::string op_name = "mace_node_" + output_name;
op->set_name(op_name);
op->set_type("BufferTransform");
op->add_input(input_name);
op->add_output(output_name);
Argument *arg = op->add_arg();
op_def->set_name(op_name);
op_def->set_type("BufferTransform");
op_def->add_input(input_name);
op_def->add_output(output_name);
op_def->set_device_type(DeviceType::GPU);
Argument *arg = op_def->add_arg();
arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(buffer_type));
arg = op->add_arg();
arg = op_def->add_arg();
arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type));
arg = op->add_arg();
arg = op_def->add_arg();
arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg();
arg->set_name("has_data_format");
arg->set_i(has_data_format);
arg = op_def->add_arg();
arg->set_name("data_format");
arg->set_i(static_cast<int>(data_format));
if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape();
OutputShape *shape = op_def->add_output_shape();
for (auto value : input_shape) {
shape->add_dims(value);
}
}
return std::move(op);
}
} // namespace mace
......@@ -43,14 +43,15 @@ class OpenCLUtil {
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef(
static void BuildTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const OpenCLBufferType buffer_type,
const MemoryType mem_type,
bool has_data_format);
DataFormat data_format,
OperatorDef *op_def);
};
} // namespace mace
......
......@@ -263,13 +263,13 @@ MaceStatus Workspace::PreallocateOutputTensor(
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.mem_id),
tensor_mem.second.data_type,
false, tensor_mem.first));
if (tensor_mem.second.has_data_format) {
tensor->set_data_format(tensor_mem.second.data_format);
if (tensor_mem.second.data_format != DataFormat::NONE) {
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
......@@ -279,22 +279,12 @@ MaceStatus Workspace::PreallocateOutputTensor(
<< tensor->UnderlyingBuffer()->shape()[0]
<< ", "
<< tensor->UnderlyingBuffer()->shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.mem_id
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.mem_id].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
} else {
tensor->set_data_format(DataFormat::DF_NONE);
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
......
......@@ -94,7 +94,7 @@ DataFormat ParseDataFormat(const std::string &data_format_str) {
} else if (data_format_str == "OIHW") {
return DataFormat::OIHW;
} else {
return DataFormat::DF_NONE;
return DataFormat::NONE;
}
}
......
......@@ -143,7 +143,7 @@ void BMNet::SetUp() {
// Add input and output information
for (size_t i = 0; i < input_names_.size(); ++i) {
InputOutputInfo *info = net_.add_input_info();
info->set_data_format(DataFormat::NHWC);
info->set_data_format(static_cast<int>(DataFormat::NHWC));
info->set_name(input_names_[i]);
for (auto d : input_shapes_[i]) {
info->add_dims(static_cast<int>(d));
......@@ -244,7 +244,7 @@ void BMNet::AddConv(const std::string &conv_type,
op_def->add_output(output_name);
AddIntsArg(op_def, "strides", strides);
AddIntArg(op_def, "padding", padding_type);
AddIntArg(op_def, "has_data_format", 1);
AddIntArg(op_def, "data_format", static_cast<int>(DataFormat::AUTO));
AddIntArg(op_def, "T", DT_HALF);
if (has_relu6) {
AddStringArg(op_def, "activation", "RELUX");
......@@ -271,7 +271,7 @@ void BMNet::AddEltwise(const std::string &op_name,
op_def->add_output(output);
AddIntArg(op_def, "type", type);
AddIntArg(op_def, "T", DT_HALF);
AddIntArg(op_def, "has_data_format", 1);
AddIntArg(op_def, "data_format", static_cast<int>(DataFormat::AUTO));
OutputShape *shape = op_def->add_output_shape();
for (auto dim : output_shape) {
shape->add_dims(dim);
......
......@@ -27,6 +27,7 @@
#include "mace/public/mace.h"
#include "mace/port/env.h"
#include "mace/port/file_system.h"
#include "mace/core/net_def_adapter.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
......@@ -282,9 +283,9 @@ MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<void> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get());
MACE_CHECK(format == DataFormat::DF_NONE || format == DataFormat::NHWC
|| format == DataFormat::NCHW || format == OIHW,
"MACE only support DF_NONE, NHWC, NCHW and OIHW "
MACE_CHECK(format == DataFormat::NONE || format == DataFormat::NHWC
|| format == DataFormat::NCHW || format == DataFormat::OIHW,
"MACE only support NONE, NHWC, NCHW and OIHW "
"formats of input now.");
impl_ = make_unique<MaceTensor::Impl>();
impl_->shape = shape;
......@@ -495,7 +496,7 @@ MaceStatus MaceEngine::Impl::Init(
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(NHWC);
output_tensor->set_data_format(DataFormat::NHWC);
#endif
}
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
......@@ -512,26 +513,32 @@ MaceStatus MaceEngine::Impl::Init(
}
} else {
#endif
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
device_.get(),
model_data));
MemoryOptimizer mem_optimizer;
// Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(*net_def,
device_.get(),
model_data));
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), ws_.get());
net_def_adapter.AdaptNetDef(net_def, device_.get(), &adapted_net_def);
MemoryOptimizer mem_optimizer;
// Init model
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
&adapted_net_def,
ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(adapted_net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(adapted_net_def,
model_data,
device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
}
#endif
......@@ -578,14 +585,14 @@ MaceEngine::Impl::~Impl() {
MaceStatus MaceEngine::Impl::TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor) {
bool has_data_format = input_tensor->data_format() != DataFormat::DF_NONE;
DataFormat data_format = DataFormat::DF_NONE;
bool has_data_format = input_tensor->data_format() != DataFormat::NONE;
DataFormat data_format = DataFormat::NONE;
DataType input_dt = input_tensor->dtype();
if (has_data_format) {
std::vector<int> dst_dims;
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
input.second.data_format() == DataFormat::NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
......@@ -647,28 +654,28 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
DataType output_dt = output_tensor->dtype();
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (output_tensor->data_format() != DataFormat::DF_NONE &&
output->second.data_format() != DataFormat::DF_NONE &&
if (output_tensor->data_format() != DataFormat::NONE &&
output->second.data_format() != DataFormat::NONE &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
<< static_cast<int>(output_tensor->data_format()) << " to "
<< static_cast<int>(output->second.data_format());
std::vector<int> dst_dims;
if (output_tensor->data_format() == NCHW &&
output->second.data_format() == NHWC) {
if (output_tensor->data_format() == DataFormat::NCHW &&
output->second.data_format() == DataFormat::NHWC) {
dst_dims = {0, 2, 3, 1};
} else if (output_tensor->data_format() == NHWC &&
output->second.data_format() == NCHW) {
} else if (output_tensor->data_format() == DataFormat::NHWC &&
output->second.data_format() == DataFormat::NCHW) {
dst_dims = {0, 3, 1, 2};
} else {
LOG(FATAL) << "Not supported output data format: "
<< output->second.data_format() << " vs "
<< output_tensor->data_format();
<< static_cast<int>(output->second.data_format()) << " vs "
<< static_cast<int>(output_tensor->data_format());
}
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
<< static_cast<int>(output_tensor->data_format()) << " to "
<< static_cast<int>(output->second.data_format());
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
......
......@@ -15,6 +15,8 @@
#include "mace/ops/activation.h"
#include <memory>
#include <set>
#include "mace/core/operator.h"
#if defined(MACE_ENABLE_NEON)
......@@ -94,7 +96,7 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::ActivationKernel<T>>(
type, relux_max_limit, leakyrelu_coefficient);
......@@ -132,6 +134,24 @@ void RegisterActivation(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Activation", ActivationOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Activation")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -207,7 +207,8 @@ void TestSimplePrelu() {
// Run
net.RunOp(D);
} else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Activation", "PreluTest")
.Input("InputNCHW")
.Input("Alpha")
......@@ -217,7 +218,8 @@ void TestSimplePrelu() {
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>(
......
......@@ -67,7 +67,7 @@ class AddNOp<DeviceType::GPU, T> : public Operation {
public:
explicit AddNOp(OpConstructContext *context)
: Operation(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::AddNKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -101,6 +101,24 @@ void RegisterAddN(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("AddN")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -54,7 +54,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......
......@@ -174,7 +174,7 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
float leakyrelu_coefficient = Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BatchNormKernel<T>>(
epsilon, activation, relux_max_limit, leakyrelu_coefficient);
......
......@@ -34,7 +34,8 @@ void Simple() {
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
......@@ -47,7 +48,8 @@ void Simple() {
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
......@@ -93,8 +95,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("BatchNorm", "BatchNormTest")
......@@ -112,8 +114,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -163,8 +165,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
......@@ -179,8 +181,8 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -230,8 +232,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
......@@ -246,8 +248,8 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -296,8 +298,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
......@@ -312,8 +314,8 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......
......@@ -264,7 +264,7 @@ class BatchToSpaceNDOp<DeviceType::GPU, T> : public BatchToSpaceOpBase {
public:
explicit BatchToSpaceNDOp(OpConstructContext *context)
: BatchToSpaceOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::BatchToSpaceKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -103,7 +103,7 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
: Operation(context),
has_data_format_(Operation::GetOptionalArg<int>("has_data_format", 1)) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::BiasAddKernel<T>>();
} else {
......@@ -145,6 +145,24 @@ void RegisterBiasAdd(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("BiasAdd")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -27,9 +27,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net;
// Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
......
......@@ -31,8 +31,8 @@ void BiasAddSimple() {
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
......@@ -41,8 +41,8 @@ void BiasAddSimple() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
......@@ -83,8 +83,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("BiasAdd", "BiasAddTest")
......@@ -97,8 +97,8 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -132,8 +132,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("BiasAdd", "BiasAddTest")
......@@ -146,8 +146,8 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......
......@@ -48,7 +48,6 @@ void FilterBufferToImage(int iters,
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
DataFormat::NHWC,
b2i_output);
};
......
......@@ -37,14 +37,14 @@ void TestBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -178,14 +178,14 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -218,14 +218,14 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
// Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, DataFormat::NHWC, b2i_output);
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, i2b_output);
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......
......@@ -39,14 +39,11 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
bool has_data_format = Operation::GetOptionalArg<int>("has_data_format", 0)
!= 0;
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_,
has_data_format, output);
context, input, type, out_mem_type_, wino_blk_size_, output);
}
private:
......
......@@ -48,7 +48,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, bt_output);
type, MemoryType::GPU_BUFFER, 0, bt_output);
// Inverse Transform
Tensor *output = net.ws()->CreateTensor(
......@@ -57,7 +57,7 @@ void TestBidirectionTransform(const OpenCLBufferType type,
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, DataFormat::NHWC, output);
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
......@@ -94,7 +94,7 @@ void TestArgumentTransform(const index_t input_size) {
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, DataFormat::NHWC, output);
0, output);
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output->buffer_shape()[0]);
......
......@@ -82,7 +82,7 @@ class ChannelShuffleOp<DeviceType::GPU, T> : public Operation {
explicit ChannelShuffleOp(OpConstructContext *context)
: Operation(context) {
const int groups = Operation::GetOptionalArg<int>("group", 1);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ChannelShuffleKernel<T>>(groups);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -116,7 +116,7 @@ void RegisterChannelShuffle(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("ChannelShuffle")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
......
......@@ -28,8 +28,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
......@@ -40,8 +40,8 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>(
......
......@@ -40,19 +40,19 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
index_t input_height = 0, input_width = 0;
index_t kernel_height = 0, kernel_width = 0;
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
input_height = input_shape[2];
input_width = input_shape[3];
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
input_height = input_shape[1];
input_width = input_shape[2];
} else {
MACE_NOT_IMPLEMENTED;
}
if (filter_format == OIHW) {
if (filter_format == DataFormat::OIHW) {
kernel_height = filter_shape[2];
kernel_width = filter_shape[3];
} else if (filter_format == OHWI) {
} else if (filter_format == DataFormat::OHWI) {
kernel_height = filter_shape[1];
kernel_width = filter_shape[2];
} else {
......@@ -97,11 +97,11 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
0, (output_width - 1) * strides[1] + k_extent_width - input_width);
output_shape[0] = input_shape[0];
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
output_shape[1] = output_channels;
output_shape[2] = output_height;
output_shape[3] = output_width;
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
output_shape[1] = output_height;
output_shape[2] = output_width;
output_shape[3] = output_channels;
......@@ -117,7 +117,8 @@ void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW
Padding padding,
index_t *output_shape,
int *padding_size) {
CalcPaddingAndOutputSize(input_shape, NCHW, filter_shape, OIHW, dilations,
CalcPaddingAndOutputSize(input_shape, DataFormat::NCHW, filter_shape,
DataFormat::OIHW, dilations,
strides, padding, output_shape, padding_size);
}
......@@ -128,7 +129,8 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
Padding padding,
index_t *output_shape,
int *padding_size) {
CalcPaddingAndOutputSize(input_shape, NHWC, filter_shape, OIHW, dilations,
CalcPaddingAndOutputSize(input_shape, DataFormat::NHWC, filter_shape,
DataFormat::OIHW, dilations,
strides, padding, output_shape, padding_size);
}
......@@ -151,19 +153,19 @@ void CalcOutputSize(const index_t *input_shape,
index_t input_height = 0, input_width = 0;
index_t kernel_height = 0, kernel_width = 0;
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
input_height = input_shape[2];
input_width = input_shape[3];
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
input_height = input_shape[1];
input_width = input_shape[2];
} else {
MACE_NOT_IMPLEMENTED;
}
if (filter_format == OIHW) {
if (filter_format == DataFormat::OIHW) {
kernel_height = filter_shape[2];
kernel_width = filter_shape[3];
} else if (filter_format == OHWI) {
} else if (filter_format == DataFormat::OHWI) {
kernel_height = filter_shape[1];
kernel_width = filter_shape[2];
} else {
......@@ -195,11 +197,11 @@ void CalcOutputSize(const index_t *input_shape,
}
output_shape[0] = input_shape[0];
if (input_format == NCHW) {
if (input_format == DataFormat::NCHW) {
output_shape[1] = output_channels;
output_shape[2] = output_height;
output_shape[3] = output_width;
} else if (input_format == NHWC) {
} else if (input_format == DataFormat::NHWC) {
output_shape[1] = output_height;
output_shape[2] = output_width;
output_shape[3] = output_channels;
......@@ -215,7 +217,8 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
const int *strides,
const RoundType round_type,
index_t *output_shape) {
CalcOutputSize(input_shape, NHWC, filter_shape, OIHW, padding_size, dilations,
CalcOutputSize(input_shape, DataFormat::NHWC, filter_shape,
DataFormat::OIHW, padding_size, dilations,
strides, round_type, output_shape);
}
......@@ -226,7 +229,8 @@ void CalcNCHWOutputSize(const index_t *input_shape, // NCHW
const int *strides,
const RoundType round_type,
index_t *output_shape) {
CalcOutputSize(input_shape, NCHW, filter_shape, OIHW, padding_size, dilations,
CalcOutputSize(input_shape, DataFormat::NCHW, filter_shape,
DataFormat::OIHW, padding_size, dilations,
strides, round_type, output_shape);
}
......@@ -241,14 +245,18 @@ void CalcDeconvShape_TF(const std::vector<index_t> &input_shape,
std::vector<index_t> *padded_out_shape,
DataFormat data_format) {
const index_t
in_height = data_format == NCHW ? input_shape[2] : input_shape[1];
in_height =
data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1];
const index_t
in_width = data_format == NCHW ? input_shape[3] : input_shape[2];
in_width =
data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2];
const index_t
out_height = data_format == NCHW ? output_shape[2] : output_shape[1];
out_height =
data_format == DataFormat::NCHW ? output_shape[2] : output_shape[1];
const index_t
out_width = data_format == NCHW ? output_shape[3] : output_shape[2];
out_width =
data_format == DataFormat::NCHW ? output_shape[3] : output_shape[2];
const index_t extended_in_height = (in_height - 1) * strides[0] + 1;
const index_t extended_in_width = (in_width - 1) * strides[1] + 1;
......@@ -307,11 +315,11 @@ void CalcDeconvShape_TF(const std::vector<index_t> &input_shape,
padded_out_shape->resize(4);
(*padded_out_shape)[0] = output_shape[0];
(*padded_out_shape)[1] =
data_format == NCHW ? output_channel : padded_out_height;
data_format == DataFormat::NCHW ? output_channel : padded_out_height;
(*padded_out_shape)[2] =
data_format == NCHW ? padded_out_height : padded_out_width;
data_format == DataFormat::NCHW ? padded_out_height : padded_out_width;
(*padded_out_shape)[3] =
data_format == NCHW ? padded_out_width : output_channel;
data_format == DataFormat::NCHW ? padded_out_width : output_channel;
}
}
......@@ -325,9 +333,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
std::vector<index_t> *padded_out_shape,
DataFormat data_format) {
const index_t
in_height = data_format == NCHW ? input_shape[2] : input_shape[1];
in_height =
data_format == DataFormat::NCHW ? input_shape[2] : input_shape[1];
const index_t
in_width = data_format == NCHW ? input_shape[3] : input_shape[2];
in_width =
data_format == DataFormat::NCHW ? input_shape[3] : input_shape[2];
const index_t output_channel = filter_shape[0] * group;
......@@ -351,11 +361,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
padded_out_shape->resize(4);
(*padded_out_shape)[0] = input_shape[0];
(*padded_out_shape)[1] =
data_format == NCHW ? output_channel : padded_out_height;
data_format == DataFormat::NCHW ? output_channel : padded_out_height;
(*padded_out_shape)[2] =
data_format == NCHW ? padded_out_height : padded_out_width;
data_format == DataFormat::NCHW ? padded_out_height : padded_out_width;
(*padded_out_shape)[3] =
data_format == NCHW ? padded_out_width : output_channel;
data_format == DataFormat::NCHW ? padded_out_width : output_channel;
}
if (out_shape != nullptr) {
......@@ -363,9 +373,11 @@ void CalcDeconvShape_Caffe(const std::vector<index_t> &input_shape,
index_t out_width = padded_out_width - out_pad_size[1];
out_shape->resize(4);
(*out_shape)[0] = input_shape[0];
(*out_shape)[1] = data_format == NCHW ? output_channel : out_height;
(*out_shape)[2] = data_format == NCHW ? out_height : out_width;
(*out_shape)[3] = data_format == NCHW ? out_width : output_channel;
(*out_shape)[1] =
data_format == DataFormat::NCHW ? output_channel : out_height;
(*out_shape)[2] = data_format == DataFormat::NCHW ? out_height : out_width;
(*out_shape)[3] =
data_format == DataFormat::NCHW ? out_width : output_channel;
}
}
......@@ -385,7 +397,7 @@ void CalDeconvOutputShapeAndPadSize(const std::vector<index_t> &input_shape,
MACE_CHECK(output_shape->size() == 4,
"deconv output shape shoud be 4-dims");
std::vector<index_t> &out_shape = *output_shape;
if (data_format == NCHW) {
if (data_format == DataFormat::NCHW) {
const index_t t = out_shape[1];
out_shape[1] = out_shape[3];
out_shape[3] = out_shape[2];
......
......@@ -199,7 +199,7 @@ class ConcatOp<DeviceType::GPU, T> : public ConcatOpBase {
public:
explicit ConcatOp(OpConstructContext *context)
: ConcatOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ConcatKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -241,12 +241,12 @@ void RegisterConcat(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Concat")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
} else {
......
......@@ -231,9 +231,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcPaddingAndOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
filter->shape().data(),
OHWI,
DataFormat::OHWI,
dilations_.data(),
strides_.data(),
padding_type_,
......@@ -242,9 +242,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
filter->shape().data(),
OHWI,
DataFormat::OHWI,
paddings_.data(),
dilations_.data(),
strides_.data(),
......@@ -459,14 +459,13 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
"leakyrelu_coefficient", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::Conv2dKernel<T>>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::Conv2dKernel<T>>();
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
......
......@@ -47,8 +47,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 1, 1, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -60,8 +60,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -105,8 +105,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 3, 3, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -118,8 +118,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -189,8 +189,8 @@ void TestNHWCSimple3x3WithoutBias() {
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -203,8 +203,8 @@ void TestNHWCSimple3x3WithoutBias() {
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
......@@ -256,8 +256,8 @@ void TestNHWCCombined3x3() {
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -270,8 +270,8 @@ void TestNHWCCombined3x3() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -321,8 +321,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 1, 1, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -336,8 +336,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -376,8 +376,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
const std::vector<index_t> output_shape = {1, 1, 1, 1};
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -391,8 +391,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -459,8 +459,8 @@ void TestConv1x1() {
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -472,8 +472,8 @@ void TestConv1x1() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
......@@ -532,8 +532,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
false);
net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -552,8 +552,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -651,8 +651,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
float_bias_data,
true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -667,8 +667,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -811,8 +811,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -828,8 +828,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -900,8 +900,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -916,8 +916,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -979,8 +979,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -994,8 +994,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -1118,12 +1118,12 @@ void TestQuant(const index_t batch,
net.AddRandomInput<CPU, float>("Filter", {out_channels, k_height, k_width,
in_channels}, true);
net.AddRandomInput<CPU, float>("Bias", {out_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.TransformFilterDataFormat<DeviceType::CPU, float>("Filter",
OHWI,
DataFormat::OHWI,
"FilterOIHW",
OIHW);
DataFormat::OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
......@@ -1136,8 +1136,8 @@ void TestQuant(const index_t batch,
.AddIntArg("T", static_cast<int>(DT_FLOAT))
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeFilter")
.Input("Filter")
......
......@@ -117,7 +117,7 @@ class CropOp<DeviceType::GPU, T> : public Operation {
public:
explicit CropOp(OpConstructContext *context)
: Operation(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::CropKernel<T>>(
Operation::GetRepeatedArgs<int>("offset"));
} else {
......@@ -145,6 +145,24 @@ void RegisterCrop(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Crop", CropOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Crop")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
int has_data_format =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op, "has_data_format", 0);
if (!has_data_format ||
op->output_shape(0).dims_size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -42,13 +42,13 @@ void RunCrop(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input0",
NHWC,
DataFormat::NHWC,
"InputNCHW0",
NCHW);
DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1",
NHWC,
DataFormat::NHWC,
"InputNCHW1",
NCHW);
DataFormat::NCHW);
OpDefBuilder("Crop", "CropTest")
.Input("InputNCHW0")
.Input("InputNCHW1")
......@@ -62,8 +62,8 @@ void RunCrop(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
......
......@@ -32,8 +32,8 @@ void SimpleTestWithDataFormat(const std::vector<index_t> &shape,
OpsTestNet net;
net.AddInputFromArray<CPU, T>("Input", shape, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Cumsum", "CumsumTest")
.Input("InputNCHW")
......@@ -48,8 +48,8 @@ void SimpleTestWithDataFormat(const std::vector<index_t> &shape,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
net.AddInputFromArray<CPU, T>("ExpectedOutput", shape, output);
ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
......
......@@ -173,7 +173,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::Deconv2dKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -197,7 +197,6 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
}
}
MaceStatus Run(OpContext *context) override {
......@@ -241,7 +240,7 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
&out_paddings,
nullptr,
model_type_,
NHWC);
DataFormat::NHWC);
return kernel_->Compute(context, input, filter, bias,
strides_.data(), in_paddings.data(), activation_,
......@@ -264,6 +263,30 @@ void RegisterDeconv2D(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp,
DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Deconv2D")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
MACE_NOT_IMPLEMENTED;
}
FrameworkType framework_type =
static_cast<FrameworkType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*(context->operator_def()), "framework_type",
FrameworkType::TENSORFLOW));
if (framework_type == FrameworkType::TENSORFLOW) {
context->SetInputInfo(2, MemoryType::CPU_BUFFER,
DataType::DT_INT32);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -47,7 +47,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
// TODO(liutuo): remove the unused transform
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.TransformFilterDataFormat<D, float>(
"Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW);
if (D == DeviceType::GPU) {
if (model_type == FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
......@@ -77,8 +78,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
}
net.RunOp(D);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
if (model_type == FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
......@@ -109,8 +110,8 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
......@@ -380,8 +381,8 @@ void TestComplexDeconvNxN(const int batch,
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
false);
net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
int out_h = 0;
int out_w = 0;
......@@ -440,8 +441,8 @@ void TestComplexDeconvNxN(const int batch,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......
......@@ -96,7 +96,7 @@ class DepthToSpaceOp<DeviceType::GPU, T> : public Operation {
explicit DepthToSpaceOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthToSpaceKernel<T>>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -32,8 +32,8 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -41,8 +41,8 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
......@@ -114,8 +114,8 @@ void RandomTest(const int block_size,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
......@@ -125,8 +125,8 @@ void RandomTest(const int block_size,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("Input")
......
......@@ -188,9 +188,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
filter->dim(2) * filter->dim(3), filter->dim(0), filter->dim(1), 1};
if (paddings_.empty()) {
CalcPaddingAndOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
ohwi_shape.data(),
OHWI,
DataFormat::OHWI,
dilations_.data(),
strides_.data(),
padding_type_,
......@@ -199,9 +199,9 @@ class DepthwiseConv2dOp<DeviceType::CPU, uint8_t>
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(),
NHWC,
DataFormat::NHWC,
ohwi_shape.data(),
OHWI,
DataFormat::OHWI,
paddings_.data(),
dilations_.data(),
strides_.data(),
......@@ -375,14 +375,13 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::DepthwiseConv2dKernel<T>>();
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_ = make_unique<opencl::buffer::DepthwiseConv2dKernel<T>>();
}
context->set_output_mem_type(mem_type);
Tensor *filter_tensor = context->workspace()->GetTensor(
operator_def_->input(1));
if (filter_tensor != nullptr && filter_tensor->is_weight()) {
......@@ -393,8 +392,6 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
} else {
context->SetInputOpenCLBufferType(1, OpenCLBufferType::DW_CONV2D_FILTER);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
......@@ -440,7 +437,40 @@ void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "DepthwiseConv2d",
DepthwiseConv2dOp, DeviceType::GPU, half);
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
.SetInputMemoryTypeSetter(
[](OpConditionContext *context) -> void {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->device_type() == DeviceType::GPU) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
} else {
mem_type = MemoryType::GPU_BUFFER;
}
auto filter_tensor = context->workspace()->GetTensor(
context->operator_def()->input(1));
if (filter_tensor == nullptr || !filter_tensor->is_weight()) {
context->SetInputOpenCLBufferType(
1, OpenCLBufferType::DW_CONV2D_FILTER);
}
}
context->set_output_mem_type(mem_type);
}));
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("DepthwiseConv2d")
.SetInputsDataFormatSelector(
[](OpConditionContext *context) -> std::vector<DataFormat> {
DataFormat op_data_format =
static_cast<DataFormat>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*context->operator_def(), "data_format",
static_cast<int>(DataFormat::NONE)));
return {op_data_format, DataFormat::OIHW, DataFormat::NONE};
}));
}
} // namespace ops
......
......@@ -39,8 +39,8 @@ void SimpleValidTest() {
true);
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -52,8 +52,8 @@ void SimpleValidTest() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
......@@ -127,8 +127,8 @@ void ComplexValidTest(index_t batch,
true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -141,8 +141,8 @@ void ComplexValidTest(index_t batch,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
......@@ -249,8 +249,8 @@ void TestNxNS12(const index_t height, const index_t width) {
{multiplier * channel},
true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -267,8 +267,8 @@ void TestNxNS12(const index_t height, const index_t width) {
// Run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -389,9 +389,9 @@ void TestQuant(const index_t batch,
"Filter", {k_height, k_width, in_channels, multiplier}, true, false);
net.AddRandomInput<CPU, float>("Bias", {out_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.TransformFilterDataFormat<DeviceType::CPU, float>(
"Filter", HWIO, "FilterOIHW", OIHW);
"Filter", DataFormat::HWIO, "FilterOIHW", DataFormat::OIHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
......@@ -405,7 +405,7 @@ void TestQuant(const index_t batch,
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeFilter")
.Input("Filter")
......
......@@ -190,7 +190,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::DepthwiseDeconv2dKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......@@ -230,7 +230,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
&out_paddings,
nullptr,
CAFFE,
NHWC);
DataFormat::NHWC);
return kernel_->Compute(context,
input,
......
......@@ -39,7 +39,8 @@ void RunTestSimple(const int group,
// Add input data
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.TransformFilterDataFormat<D, float>(
"Filter", DataFormat::HWOI, "FilterOIHW", DataFormat::OIHW);
const index_t out_channels = expected_shape[3];
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
......@@ -56,8 +57,8 @@ void RunTestSimple(const int group,
net.RunOp(D);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
"InputNCHW", NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
......@@ -69,8 +70,8 @@ void RunTestSimple(const int group,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
......@@ -193,8 +194,8 @@ void RandomTest(index_t batch,
{channel * multiplier},
bias_data, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputNCHW")
.Input("Filter")
......@@ -210,8 +211,8 @@ void RandomTest(index_t batch,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
......
......@@ -1145,7 +1145,7 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1);
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::EltwiseKernel<T>>(
type, coeff, scalar_input, scalar_input_index);
......
......@@ -69,7 +69,8 @@ void SimpleTensorScalar(const ops::EltwiseType type,
net.AddInputFromArray<D, T>("Input", shape, input);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, T>("Input", NHWC, "TInput", NCHW);
net.TransformDataFormat<D, T>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
.AddIntArg("T", DataTypeToEnum<T>::v())
......@@ -81,7 +82,8 @@ void SimpleTensorScalar(const ops::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
net.TransformDataFormat<D, DstType>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input")
......@@ -124,13 +126,15 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
.OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
.Output("TOutput");
if (shape0.size() > 1) {
net.TransformDataFormat<D, T>("Input0", NHWC, "TInput0", NCHW);
net.TransformDataFormat<D, T>(
"Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
op_builder.Input("TInput0");
} else {
op_builder.Input("Input0");
}
if (shape1.size() > 1) {
net.TransformDataFormat<D, T>("Input1", NHWC, "TInput1", NCHW);
net.TransformDataFormat<D, T>(
"Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
op_builder.Input("TInput1");
} else {
op_builder.Input("Input1");
......@@ -139,7 +143,8 @@ void SimpleTensorEltwise(const ops::EltwiseType type,
// Run
net.RunOp(D);
net.TransformDataFormat<D, DstType>("TOutput", NCHW, "Output", NHWC);
net.TransformDataFormat<D, DstType>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("Input0")
......@@ -560,7 +565,8 @@ void GPUOverflowTest(const ops::EltwiseType type,
net.AddInputFromArray<DeviceType::GPU, T>(
"Filter",
{output_shape.back(), shape0.back(), 3, 3},
std::vector<float>(output_shape.back() * shape0.back() * 9, 1));
std::vector<float>(output_shape.back() * shape0.back() * 9, 1),
true);
OpDefBuilder("Conv2D", "Conv2D")
.AddIntArg("T", DataTypeToEnum<T>::v())
.Input("EltOutput")
......@@ -636,8 +642,8 @@ void RandomTensorScalar(const ops::EltwiseType type,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", shape, false, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
......@@ -647,8 +653,8 @@ void RandomTensorScalar(const ops::EltwiseType type,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -690,10 +696,10 @@ void RandomTensorEltwise(const ops::EltwiseType type,
true,
true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
......@@ -705,8 +711,8 @@ void RandomTensorEltwise(const ops::EltwiseType type,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -746,10 +752,10 @@ void Quantized(const std::vector<index_t> &shape,
true,
true);
net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input0", DataFormat::NHWC, "TInput0", DataFormat::NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input1", DataFormat::NHWC, "TInput1", DataFormat::NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
......@@ -761,8 +767,8 @@ void Quantized(const std::vector<index_t> &shape,
// Run
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeInput0")
.Input("Input0")
......
......@@ -14,7 +14,6 @@
#include "mace/core/operator.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/math.h"
namespace mace {
......@@ -44,27 +43,8 @@ class ExpandDimsOp<DeviceType::CPU, T> : public Operation {
std::vector<index_t> output_shape(input_shape);
output_shape.insert(output_shape.begin() + axis_, 1);
bool has_data_format = Operation::GetOptionalArg<int>(
"has_data_format", 0) == 1;
if (has_data_format && output_shape.size() == 4) {
// only tensorflow support expand dim, so the default format is NHWC
// transform NHWC to NCHW
auto t_output_shape = TransposeShape<int64_t, int64_t>(output_shape,
{0, 3, 1, 2});
output->Resize(t_output_shape);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
Transpose(&context->device()->cpu_runtime()->thread_pool(),
input_data, output_shape, {0, 3, 1, 2}, output_data);
} else {
output->Resize(output_shape);
Tensor::MappingGuard input_guard(input);
auto input_data = input->data<T>();
output->Copy<T>(input_data, input->size());
}
output->ReuseTensorBuffer(*input);
output->Reshape(output_shape);
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -49,7 +49,8 @@ void Simple() {
net.AddInputFromArray<D, float>("Offset", {1}, offset, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
......@@ -58,7 +59,8 @@ void Simple() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("Input")
......@@ -100,8 +102,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -113,8 +115,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -151,8 +153,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -164,8 +166,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -205,8 +207,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -218,8 +220,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -254,11 +256,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
......@@ -270,8 +272,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......
......@@ -190,7 +190,7 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
explicit FullyConnectedOp(OpConstructContext *context)
: FullyConnectedOpBase(context) {
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
mem_type = MemoryType::GPU_IMAGE;
kernel_ = make_unique<opencl::image::FullyConnectedKernel<T>>();
} else {
......
......@@ -48,7 +48,8 @@ void Simple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
......@@ -129,8 +130,8 @@ void Random(const index_t batch,
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}, true,
false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW")
.Input("Weight")
......@@ -143,7 +144,8 @@ void Random(const index_t batch,
// run cpu
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>();
......@@ -215,8 +217,10 @@ void QuantRandom(const index_t batch,
net.AddRandomInput<CPU, float>(
"Weight", {out_channel, height, width, channels}, true);
net.AddRandomInput<CPU, float>("Bias", {out_channel}, true);
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformFilterDataFormat<CPU, float>("Weight", OHWI, "WeightOIHW", OIHW);
net.TransformDataFormat<CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.TransformFilterDataFormat<CPU, float>(
"Weight", DataFormat::OHWI, "WeightOIHW", DataFormat::OIHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW")
......@@ -226,7 +230,8 @@ void QuantRandom(const index_t batch,
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeWeight")
.Input("Weight")
......
......@@ -29,7 +29,8 @@ void Simple() {
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("InputNCHW")
......@@ -41,7 +42,8 @@ void Simple() {
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
......
......@@ -36,7 +36,7 @@ class LSTMCellOp<DeviceType::GPU, T> : public Operation {
Operation::GetOptionalArg<float>("scalar_input",
0.0));
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::LSTMCellKernel<T>>(forget_bias);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -518,14 +518,6 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::CPU, uint8_t);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -23,7 +23,6 @@
#include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/common/transpose.h"
#include "mace/utils/memory.h"
namespace mace {
......@@ -48,7 +47,6 @@ class OpenCLBufferTransformer {
const OpenCLBufferType type,
const MemoryType out_mem_type,
const int wino_blk_size,
bool has_data_format,
Tensor *output) {
Workspace *ws = context->workspace();
DataType dt = DataTypeToEnum<T>::value;
......@@ -67,31 +65,11 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform CPU Buffer " << input->name()
<< " to GPU Buffer " << internal_tensor->name()
<< " with data type " << dt;
if (has_data_format && input->shape().size() == 4) {
// 1. (NCHW -> NHWC)
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(input->shape(),
dst_dims);
internal_tensor->Resize(output_shape);
internal_tensor->set_data_format(DataFormat::NHWC);
// TODO(liuqi): Only support float now
const float *input_ptr = input->data<float>();
Tensor::MappingGuard guard(internal_tensor);
float *internal_ptr = internal_tensor->mutable_data<float>();
MACE_RETURN_IF_ERROR(ops::Transpose(
&context->device()->cpu_runtime()->thread_pool(),
input_ptr,
input->shape(),
dst_dims,
internal_ptr));
} else {
internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size());
}
internal_tensor->Resize(input->shape());
const uint8_t *input_ptr = input->data<uint8_t>();
Tensor::MappingGuard guard(internal_tensor);
uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
memcpy(internal_ptr, input_ptr, input->raw_size());
// 2. convert the internal GPU Buffer to output.
return kernel_->Compute(
context, internal_tensor, type, wino_blk_size, output);
......@@ -108,30 +86,12 @@ class OpenCLBufferTransformer {
VLOG(2) << "Transform GPU Buffer " << internal_tensor.name()
<< " to CPU Buffer " << output->name()
<< " with data type " << dt;
if (has_data_format && internal_tensor.shape().size() == 4) {
// NHWC -> NCHW
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(internal_tensor.shape(),
dst_dims);
output->set_data_format(DataFormat::NCHW);
Tensor::MappingGuard guard(&internal_tensor);
const float *internal_ptr = internal_tensor.data<float>();
output->Resize(output_shape);
float *output_ptr = output->mutable_data<float>();
return ops::Transpose(&context->device()->cpu_runtime()->thread_pool(),
internal_ptr,
internal_tensor.shape(),
dst_dims,
output_ptr);
} else {
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS;
}
Tensor::MappingGuard guard(&internal_tensor);
const T *internal_ptr = internal_tensor.data<T>();
output->Resize(internal_tensor.shape());
T *output_ptr = output->mutable_data<T>();
memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T));
return MaceStatus::MACE_SUCCESS;
} else {
LOG(FATAL) << "Unexpected error: " << out_mem_type;
return MaceStatus::MACE_SUCCESS;
......@@ -172,7 +132,7 @@ MaceStatus TransformFilter(
input->MarkUnused();
return OpenCLBufferTransformer<T>(input->memory_type(), mem_type).
Transform(&op_context, input, buffer_type, mem_type, wino_blk_size,
DataFormat::DF_NONE, output);
output);
}
} // namespace ops
......
......@@ -71,14 +71,17 @@ MaceStatus EltwiseKernel<T>::Compute(
if (input1 == nullptr) {
input1_type = "INPUT_SCALAR";
} else {
MACE_CHECK(input0->dim_size() == input1->dim_size() ||
MACE_CHECK((input0->dim_size() == input1->dim_size()
&& input0->dim_size() == 4) ||
input0->dim_size() == 1 || input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
<< "Inputs of Eltwise op must be same shape or fulfill broadcast logic";
MACE_CHECK(type_ != EltwiseType::EQUAL)
<< "Eltwise op on GPU does not support EQUAL";
// broadcast
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
if (input0->size() != input1->size() ||
input0->dim_size() != input1->dim_size()) {
if (input0->size() < input1->size()
|| input0->dim_size() < input1->dim_size()) {
std::swap(input0, input1);
swapped = true;
}
......
......@@ -59,11 +59,6 @@ MaceStatus ReduceKernel<T>::Compute(
const Tensor *input,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims.");
MACE_CHECK(input->dim_size() == 4,
"reduce gpu only support 4-dim input");
MACE_CHECK(axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2,
"reduce gpu only support 1,2-axis reduce");
index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......
......@@ -15,6 +15,7 @@
#include "mace/ops/ops_test_util.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/memory.h"
#include "mace/core/net_def_adapter.h"
namespace mace {
namespace ops {
......@@ -175,26 +176,27 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() {
bool OpsTestNet::Setup(mace::DeviceType device) {
NetDef net_def;
for (auto &op_def : op_defs_) {
net_def.add_op()->CopyFrom(op_def);
auto target_op = net_def.add_op();
target_op->CopyFrom(op_def);
auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "has_data_format", 0);
auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "T", static_cast<int>(DT_FLOAT))
== static_cast<int>(DT_UINT8);
for (auto input : op_def.input()) {
if (ws_.GetTensor(input) != nullptr &&
!ws_.GetTensor(input)->is_weight()) {
auto input_info = net_def.add_input_info();
input_info->set_name(input);
auto has_data_format = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "has_data_format", 1);
auto is_quantized_op = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op_def, "T", static_cast<int>(DT_FLOAT))
== static_cast<int>(DT_UINT8);
if (has_data_format) {
if (is_quantized_op || device == DeviceType::GPU) {
input_info->set_data_format(NHWC);
input_info->set_data_format(static_cast<int>(DataFormat::NHWC));
} else {
input_info->set_data_format(NCHW);
input_info->set_data_format(static_cast<int>(DataFormat::NCHW));
}
} else {
input_info->set_data_format(DataFormat::DF_NONE);
input_info->set_data_format(static_cast<int>(DataFormat::NONE));
}
auto &shape = ws_.GetTensor(input)->shape();
for (auto d : shape) {
......@@ -202,6 +204,10 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
}
}
}
if (has_data_format) {
SetProtoArg<int>(target_op, "data_format",
static_cast<int>(DataFormat::AUTO));
}
}
if (!op_defs_.empty()) {
auto op_def = op_defs_.back();
......@@ -216,15 +222,21 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
}
}
}
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), &ws_);
net_def_adapter.AdaptNetDef(&net_def,
OpTestContext::Get()->GetDevice(device),
&adapted_net_def);
MemoryOptimizer mem_optimizer;
net_ = make_unique<SerialNet>(
op_registry_.get(),
&net_def,
&adapted_net_def,
&ws_,
OpTestContext::Get()->GetDevice(device),
&mem_optimizer);
MaceStatus status = (ws_.PreallocateOutputTensor(
net_def,
adapted_net_def,
&mem_optimizer,
OpTestContext::Get()->GetDevice(device)));
if (status != MaceStatus::MACE_SUCCESS) return false;
......@@ -267,15 +279,20 @@ MaceStatus OpsTestNet::RunOp() {
MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def,
const mace::DeviceType device) {
device_type_ = device;
NetDef adapted_net_def;
NetDefAdapter net_def_adapter(op_registry_.get(), &ws_);
net_def_adapter.AdaptNetDef(&net_def,
OpTestContext::Get()->GetDevice(device),
&adapted_net_def);
MemoryOptimizer mem_optimizer;
net_ = make_unique<SerialNet>(
op_registry_.get(),
&net_def,
&adapted_net_def,
&ws_,
OpTestContext::Get()->GetDevice(device),
&mem_optimizer);
MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor(
net_def,
adapted_net_def,
&mem_optimizer,
OpTestContext::Get()->GetDevice(device)));
MACE_RETURN_IF_ERROR(net_->Init());
......
......@@ -223,7 +223,7 @@ class OpsTestNet {
const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4");
if (src_format == NHWC && dst_format == NCHW) {
if (src_format == DataFormat::NHWC && dst_format == DataFormat::NCHW) {
index_t batch = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
......@@ -243,7 +243,8 @@ class OpsTestNet {
}
}
}
} else if (src_format == NCHW && dst_format == NHWC) {
} else if (src_format == DataFormat::NCHW &&
dst_format == DataFormat::NHWC) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -281,7 +282,7 @@ class OpsTestNet {
input->is_weight());
const std::vector<index_t> input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4, "input shape != 4");
if (src_format == HWOI && dst_format == OIHW) {
if (src_format == DataFormat::HWOI && dst_format == DataFormat::OIHW) {
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t out_channels = input_shape[2];
......@@ -299,7 +300,8 @@ class OpsTestNet {
input_data[j * out_channels * in_channels + i];
}
}
} else if (src_format == OIHW && dst_format == HWOI) {
} else if (src_format == DataFormat::OIHW &&
dst_format == DataFormat::HWOI) {
index_t out_channels = input_shape[0];
index_t in_channels = input_shape[1];
index_t height = input_shape[2];
......@@ -317,7 +319,8 @@ class OpsTestNet {
input_data[j * height * width + i];
}
}
} else if (src_format == HWIO && dst_format == OIHW) {
} else if (src_format == DataFormat::HWIO &&
dst_format == DataFormat::OIHW) {
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t in_channels = input_shape[2];
......@@ -337,7 +340,8 @@ class OpsTestNet {
}
}
}
} else if (src_format == OHWI && dst_format == OIHW) {
} else if (src_format == DataFormat::OHWI &&
dst_format == DataFormat::OIHW) {
index_t out_channels = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
......
......@@ -179,7 +179,7 @@ class PadOp<DeviceType::GPU, T> : public Operation {
std::vector<int> paddings = Operation::GetRepeatedArgs<int>("paddings");
float constant_value = Operation::GetOptionalArg<float>(
"constant_value", 0.0);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PadKernel<T>>(
type, paddings, constant_value);
} else {
......
......@@ -45,8 +45,8 @@ void SimpleConstant() {
// Run
net.RunOp(D);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
.Output("TOutput")
......@@ -58,8 +58,8 @@ void SimpleConstant() {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto output = net.GetTensor("Output");
......@@ -93,7 +93,8 @@ void Result(const std::vector<index_t> &input_shape,
if (D == DeviceType::CPU) {
t_input = "TInput";
t_output = "TOutput";
net.TransformDataFormat<DeviceType::CPU, T>(input, NHWC, t_input, NCHW);
net.TransformDataFormat<DeviceType::CPU, T>(
input, DataFormat::NHWC, t_input, DataFormat::NCHW);
}
OpDefBuilder("Pad", "PadTest")
......@@ -108,7 +109,8 @@ void Result(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, T>(t_output, NCHW, output, NHWC);
net.TransformDataFormat<DeviceType::CPU, T>(
t_output, DataFormat::NCHW, output, DataFormat::NHWC);
}
auto actual = net.GetTensor(output.c_str());
......@@ -172,8 +174,8 @@ TEST_F(PadTest, ComplexCPU) {
// Add input data
net.AddRepeatedInput<DeviceType::CPU, float>("Input", {1, 1, 1, 2}, 2);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
.Output("TOutput")
......@@ -184,8 +186,8 @@ TEST_F(PadTest, ComplexCPU) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto output = net.GetTensor("Output");
......@@ -209,8 +211,8 @@ void Complex(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "TInput",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "TInput", DataFormat::NCHW);
OpDefBuilder("Pad", "PadTest")
.Input("TInput")
.Output("TOutput")
......@@ -222,8 +224,8 @@ void Complex(const std::vector<index_t> &input_shape,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("TOutput", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"TOutput", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......
......@@ -270,9 +270,9 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcPaddingAndOutputSize(input_tensor->shape().data(),
NHWC,
DataFormat::NHWC,
filter_shape.data(),
OHWI,
DataFormat::OHWI,
dilations_.data(),
strides_.data(),
padding_type_,
......@@ -281,9 +281,9 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(),
NHWC,
DataFormat::NHWC,
filter_shape.data(),
OHWI,
DataFormat::OHWI,
paddings_.data(),
dilations_.data(),
strides_.data(),
......@@ -477,10 +477,9 @@ class PoolingOp<DeviceType::GPU, T> : public PoolingOpBase {
public:
explicit PoolingOp(OpConstructContext *context)
: PoolingOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::PoolingKernel<T>>();
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::PoolingKernel<T>>();
}
}
......
......@@ -34,8 +34,8 @@ TEST_F(PoolingOpTest, MAX_VALID) {
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -50,8 +50,8 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected =
......@@ -68,8 +68,8 @@ TEST_F(PoolingOpTest, MAX_SAME) {
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -84,8 +84,8 @@ TEST_F(PoolingOpTest, MAX_SAME) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
......@@ -102,8 +102,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -118,8 +118,8 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
......@@ -136,8 +136,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -152,8 +152,8 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
......@@ -174,8 +174,8 @@ void SimpleMaxPooling3S2() {
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -187,8 +187,8 @@ void SimpleMaxPooling3S2() {
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else if (D == DeviceType::GPU) {
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
......@@ -224,8 +224,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -240,8 +240,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -304,8 +304,8 @@ TEST_F(PoolingOpTest, AVG_VALID) {
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -320,8 +320,8 @@ TEST_F(PoolingOpTest, AVG_VALID) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>(
......@@ -373,8 +373,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNCHW")
......@@ -389,8 +389,8 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -563,7 +563,7 @@ void TestQuant(const index_t batch,
net.AddRandomInput<CPU, float>(
"Input", input_shape, false, false);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddRandomInput<DeviceType::CPU, float>(
"OutputNCHW", input_shape, false, true, true);
......@@ -580,7 +580,7 @@ void TestQuant(const index_t batch,
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
......
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <memory>
#include <set>
#include <vector>
#include "mace/core/future.h"
......@@ -872,7 +873,7 @@ class ReduceOp<DeviceType::GPU, T> : public ReduceOpBase {
public:
explicit ReduceOp(OpConstructContext *context)
: ReduceOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ReduceKernel<T>>(reduce_type_,
axis_,
keep_dims_);
......@@ -907,6 +908,34 @@ void RegisterReduce(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Reduce", ReduceOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
MACE_REGISTER_OP_CONDITION(
op_registry,
OpConditionBuilder("Reduce")
.SetDevicePlacerFunc(
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
}
bool keep_dims =
ProtoArgHelper::GetOptionalArg<OperatorDef, bool>(
*op, "keepdims", false);
if (!keep_dims) {
return { DeviceType::CPU };
}
auto axis =
ProtoArgHelper::GetRepeatedArgs<OperatorDef, int>(
*op, "axis");
if (axis.size() != 2 || axis[0] != 1 || axis[1] != 2) {
return { DeviceType::CPU };
}
auto tensor_shape_info = context->tensor_shape_info();
if (tensor_shape_info->count(op->input(0)) == 0
|| tensor_shape_info->at(op->input(0)).size() != 4) {
return { DeviceType::CPU };
}
return { DeviceType::CPU, DeviceType::GPU };
}));
}
} // namespace ops
......
......@@ -38,7 +38,8 @@ void Simple(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<D, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Reduce", "ReduceTest")
.Input("InputNCHW")
.AddIntsArg("axis", axis)
......@@ -49,7 +50,8 @@ void Simple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<D, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("Reduce", "ReduceTest")
.Input("Input")
......@@ -289,8 +291,8 @@ void RandomTest(const std::vector<index_t> &input_shape,
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Reduce", "ReduceTest")
.Input("InputNCHW")
.AddIntsArg("axis", axis)
......@@ -301,8 +303,8 @@ void RandomTest(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Reduce", "ReduceTest")
.Input("Input")
.AddIntsArg("axis", axis)
......@@ -353,7 +355,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
net.AddRandomInput<CPU, float>(
"Input", input_shape, false, false);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", NHWC, "InputNCHW", NCHW);
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddRandomInput<DeviceType::CPU, float>(
"OutputNCHW", input_shape, false, true, true);
......@@ -368,7 +370,7 @@ void TestQuant(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", NCHW, "Output", NHWC);
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
......
......@@ -51,7 +51,7 @@ MaceStatus Deconv2d<float>::Compute(const OpContext *context,
&out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......
......@@ -50,7 +50,7 @@ MaceStatus DepthwiseDeconv2d<float>::Compute(const OpContext *context,
&out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......@@ -185,7 +185,7 @@ MaceStatus GroupDeconv2d<float>::Compute(const OpContext *context,
&out_pad_size,
&padded_out_shape,
framework_type_,
NCHW);
DataFormat::NCHW);
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
......
......@@ -212,7 +212,7 @@ class ResizeBicubicOp<DeviceType::GPU, T> : public Operation {
std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
"size", {-1, -1});
MACE_CHECK(size.size() == 2);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBicubicKernel<T>>(
align_corners, size[0], size[1]);
} else {
......
......@@ -31,8 +31,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -42,8 +42,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
......@@ -60,8 +60,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
std::vector<float> input(48);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 4, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -71,8 +71,8 @@ TEST_F(ResizeBicubicTest, CPUResizeBicubicWOAlignCornersFloat) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 2, 3, 3},
......@@ -92,8 +92,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -104,8 +104,8 @@ TEST_F(ResizeBicubicTest, ResizeBicubicWAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
......@@ -133,8 +133,8 @@ void TestRandomResizeBicubic() {
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels},
false, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBicubic", "ResizeBicubicTest")
.Input("InputNCHW")
......@@ -144,8 +144,8 @@ void TestRandomResizeBicubic() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......
......@@ -346,7 +346,7 @@ class ResizeBilinearOp<DeviceType::GPU, T> : public Operation {
std::vector<index_t> size = Operation::GetRepeatedArgs<index_t>(
"size", {-1, -1});
MACE_CHECK(size.size() == 2);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeBilinearKernel<T>>(
align_corners, size[0], size[1]);
} else {
......
......@@ -31,8 +31,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -42,8 +42,8 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
......@@ -60,8 +60,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -72,8 +72,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
......@@ -100,8 +100,8 @@ void TestRandomResizeBilinear() {
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -111,8 +111,8 @@ void TestRandomResizeBilinear() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......@@ -155,8 +155,8 @@ void TestQuantizedResizeBilinear() {
true,
-1.f,
1.f);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
......@@ -166,8 +166,8 @@ void TestQuantizedResizeBilinear() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
......
......@@ -149,7 +149,7 @@ class ResizeNearestNeighborOp<DeviceType::GPU, T> : public Operation {
: Operation(context) {
bool align_corners = Operation::GetOptionalArg<bool>(
"align_corners", false);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::ResizeNearestNeighborKernel<T>>(
align_corners);
} else {
......
......@@ -32,8 +32,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
std::iota(begin(input), end(input), 0);
std::vector<int32_t> size = {1, 2};
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
......@@ -45,8 +45,8 @@ TEST_F(ResizeNearestNeighborTest, CPUResizeNearestNeighborWOAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
......@@ -64,8 +64,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
std::iota(begin(input), end(input), 0);
std::vector<int32_t> size = {1, 2};
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddInputFromArray<DeviceType::CPU, int32_t>("Size", {2}, size);
OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
......@@ -78,8 +78,8 @@ TEST_F(ResizeNearestNeighborTest, ResizeNearestNeighborWAlignCorners) {
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
// Check
auto expected = net.CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
......@@ -105,8 +105,8 @@ void TestRandomResizeNearestNeighbor() {
std::vector<int32_t> size = {20, 40};
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
net.AddInputFromArray<D, int32_t>("Size", {2}, size);
OpDefBuilder("ResizeNearestNeighbor", "ResizeNearestNeighborTest")
.Input("InputNCHW")
......@@ -116,8 +116,8 @@ void TestRandomResizeNearestNeighbor() {
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
......
......@@ -100,11 +100,7 @@ class ScalarMathOp : public Operation {
coeff_(Operation::GetRepeatedArgs<float>("coeff")),
scalar_input_(Operation::GetOptionalArg<float>("scalar_input", 1.0)),
scalar_input_index_(Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1)) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
"scalar_input_index", 1)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......
......@@ -414,10 +414,9 @@ class SoftmaxOp<DeviceType::GPU, T> : public Operation {
: Operation(context) {
bool use_log = (
Operation::GetOptionalArg<bool>("use_log", false));
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SoftmaxKernel<T>>(use_log);
} else {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
kernel_ = make_unique<opencl::buffer::SoftmaxKernel<T>>(use_log);
}
}
......@@ -456,7 +455,7 @@ void RegisterSoftmax(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Softmax")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return { DeviceType::CPU, DeviceType::GPU };
......
......@@ -50,7 +50,8 @@ void Simple(bool use_log = false) {
if (D == DeviceType::CPU) {
// test 4d softmax
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -59,7 +60,8 @@ void Simple(bool use_log = false) {
// Run
net.RunOp(D);
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
......@@ -109,7 +111,8 @@ void Complex(const std::vector<index_t> &logits_shape,
net.AddRandomInput<D, float>("Input", logits_shape);
if (logits_shape.size() == 4) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
......@@ -127,7 +130,8 @@ void Complex(const std::vector<index_t> &logits_shape,
net.RunOp();
if (logits_shape.size() == 4) {
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
net.TransformDataFormat<CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
auto expected = net.CreateTensor<float>();
......
......@@ -307,7 +307,7 @@ class SpaceToBatchNDOp<DeviceType::GPU, T> : public SpaceToBatchOpBase {
public:
explicit SpaceToBatchNDOp(OpConstructContext *context)
: SpaceToBatchOpBase(context) {
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToBatchKernel<T>>();
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -39,8 +39,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -53,8 +53,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
......@@ -78,8 +78,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -92,8 +92,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
net.RunOp(D);
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
}
// Check
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"));
......@@ -155,8 +155,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
net.RunOp(GPU);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -164,8 +164,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
......@@ -188,8 +188,8 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
net.RunOp(GPU);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -197,8 +197,8 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputCPU"),
......@@ -218,8 +218,8 @@ void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -227,8 +227,8 @@ void TestSpaceToBatchQuantize(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
......@@ -279,8 +279,8 @@ void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
1.f);
// run cpu
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -288,8 +288,8 @@ void TestoBatchToSpaceQuantize(const std::vector<index_t> &input_shape,
.AddIntsArg("block_shape", block_shape_data)
.Finalize(net.NewOperatorDef());
net.RunOp(CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"OutputCPU", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "OutputCPU", DataFormat::NHWC);
// run quantize
OpDefBuilder("Quantize", "QuantizeInput")
......
......@@ -94,7 +94,7 @@ class SpaceToDepthOp<DeviceType::GPU, T> : public Operation {
explicit SpaceToDepthOp(OpConstructContext *context)
: Operation(context) {
int block_size = Operation::GetOptionalArg<int>("block_size", 1);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SpaceToDepthKernel<T>>(block_size);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -32,8 +32,8 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.Output("OutputNCHW")
......@@ -41,8 +41,8 @@ void RunSpaceToDepth(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
} else {
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
......@@ -107,8 +107,8 @@ void RandomTest(const int block_size,
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>(
"Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
......@@ -118,8 +118,8 @@ void RandomTest(const int block_size,
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
net.TransformDataFormat<DeviceType::CPU, float>(
"OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
OpDefBuilder("SpaceToDepth", "SpaceToDepthTest")
.Input("Input")
......
......@@ -106,7 +106,7 @@ class SplitOp<DeviceType::GPU, T> : public Operation {
explicit SplitOp(OpConstructContext *context)
: Operation(context) {
int32_t axis = Operation::GetOptionalArg<int>("axis", 3);
if (context->device()->gpu_runtime()->UseImageMemory()) {
if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
kernel_ = make_unique<opencl::image::SplitKernel<T>>(axis);
} else {
MACE_NOT_IMPLEMENTED;
......@@ -144,7 +144,7 @@ void RegisterSplit(OpRegistryBase *op_registry) {
op_registry,
OpConditionBuilder("Split")
.SetDevicePlacerFunc(
[](OpConstructContext *context) -> std::set<DeviceType> {
[](OpConditionContext *context) -> std::set<DeviceType> {
auto op = context->operator_def();
if (op->output_shape_size() != op->output_size()) {
return {DeviceType::CPU, DeviceType::GPU};
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册