Commit bfbe1a30 authored by 李寅

Merge branch 'unify-cpu-gpu' into 'master'

Unify cpu gpu

See merge request !877
......@@ -69,9 +69,9 @@ in one deployment file.
- The output tensor name(s) (TensorFlow) or the top name(s) of the outputs' layer (Caffe).
If there is more than one tensor, use one line per tensor.
* - input_shapes
- The shapes of the input tensors, in NHWC order.
- The shapes of the input tensors; NHWC order by default.
* - output_shapes
- The shapes of the output tensors, in NHWC order.
- The shapes of the output tensors; NHWC order by default.
* - input_ranges
- The numerical range of the input tensors' data, default [-1, 1]. It is used only for testing.
* - validation_inputs_data
......@@ -84,6 +84,10 @@ in one deployment file.
- [optional] The data type used for the specified runtime. One of [fp16_fp32, fp32_fp32] for GPU (default fp16_fp32), [fp32] for CPU, and [uint8] for DSP.
* - input_data_types
- [optional] The input data type for specific ops (e.g. Gather); one of [int32, float32], defaulting to float32.
* - input_data_formats
- [optional] The data formats of the input tensors, each one of [NONE, NHWC]. Use NONE when an input has no data format (e.g. a non-4D tensor). If only a single format is specified, all inputs use that format; the default is NHWC.
* - output_data_formats
- [optional] The data formats of the output tensors, each one of [NONE, NHWC]. Use NONE when an output has no data format. If only a single format is specified, all outputs use that format; the default is NHWC.
* - limit_opencl_kernel_time
- [optional] Whether to split OpenCL kernels so each launch stays within 1 ms, keeping the UI responsive; default is 0.
* - obfuscate
......
# one yaml config file can contain multi device info
devices:
# The name of the device
nanopi:
# arm64 or armhf
target_abis: [arm64, armhf]
# device SoC; you can get it from the device manual
target_socs: RK3399
# device model full name
models: FriendlyElec Nanopi M4
# device ip address
address: 10.0.0.0
# login username
username: user
# login password; not required if you can log in to the device without a password
password: 1234567
raspberry:
target_abis: [armv7l]
target_socs: BCM2837
models: Raspberry Pi 3 Model B Plus Rev 1.3
address: 10.0.0.1
username: user
password: 123456
......@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC
bool IsQuantizedModel(const NetDef &net_def) {
return
ProtoArgHelper::GetOptionalArg<NetDef, int>(net_def, "quantize_flag", 0)
== 1;
}
} // namespace mace
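For context, a quantized model is simply a NetDef whose "quantize_flag" argument equals 1. A minimal sketch of how a caller might branch on it (the helper name below is hypothetical, not part of this change):

// Hypothetical helper: pick a default tensor data type
// based on the model's quantize flag.
DataType DefaultDataTypeFor(const NetDef &net_def) {
  return IsQuantizedModel(net_def) ? DT_UINT8 : DT_FLOAT;
}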
......@@ -55,6 +55,8 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_;
};
bool IsQuantizedModel(const NetDef &def);
} // namespace mace
#endif // MACE_CORE_ARG_HELPER_H_
......@@ -233,6 +233,11 @@ class Image : public BufferBase {
}
}
inline DataType dtype() const {
MACE_CHECK_NOTNULL(buf_);
return data_type_;
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
......
......@@ -34,7 +34,7 @@ class Device {
#ifdef MACE_ENABLE_OPENCL
virtual OpenCLRuntime *opencl_runtime() = 0;
#endif
#endif // MACE_ENABLE_OPENCL
virtual CPURuntime *cpu_runtime() = 0;
virtual Allocator *allocator() = 0;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/memory_optimizer.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_set>
#include "mace/core/arg_helper.h"
#include "mace/core/macros.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
static const std::unordered_set<std::string> kReuseOp = {
"Reshape", "Identity", "Squeeze", "ExpandDims"
};
return kReuseOp.count(op_type) == 1;
}
void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) {
if (tensor_ref_count_.count(tensor_name) == 0) {
tensor_ref_count_.emplace(tensor_name, 1);
} else {
tensor_ref_count_[tensor_name] += 1;
}
}
void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) {
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (tensor_ref_count_.count(op_def->input(i)) == 1) {
tensor_ref_count_[op_def->input(i)] += 1;
}
}
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (tensor_ref_count_.count(op_def->output(i)) == 0) {
tensor_ref_count_.emplace(op_def->output(i), 0);
}
}
}
MemoryBlock MemoryOptimizer::CreateMemoryBlock(
std::vector<int64_t> shape,
DataType dt,
mace::MemoryType mem_type) {
MemoryBlock block;
#ifdef MACE_ENABLE_OPENCL
if (mem_type == MemoryType::GPU_IMAGE) {
std::vector<size_t> image_shape;
if (shape.size() == 2) {
shape = {shape[0], 1, 1, shape[1]};
} else {
MACE_CHECK(shape.size() == 4) << "GPU only supports 2D/4D input";
}
OpenCLUtil::CalImage2DShape(shape,
OpenCLBufferType::IN_OUT_CHANNEL,
&image_shape);
block.set_x(image_shape[0]);
block.set_y(image_shape[1]);
return block;
}
#endif // MACE_ENABLE_OPENCL
MACE_UNUSED(mem_type);
int64_t op_mem_size = std::accumulate(shape.begin(),
shape.end(),
GetEnumTypeSize(dt),
std::multiplies<int64_t>());
block.set_x(op_mem_size);
block.set_y(1);
return block;
}
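To make the two branches concrete, here is a worked example with assumed values; the arithmetic follows directly from the code above.

// Illustrative only: shape = {1, 8, 8, 32}, dt = DT_HALF (2 bytes).
// GPU_IMAGE: CalImage2DShape(IN_OUT_CHANNEL) yields
//   x = RoundUpDiv4(32) * 8 = 64, y = 1 * 8 = 8  -> a 64x8 image.
// CPU/GPU buffer: x = 2 * 1 * 8 * 8 * 32 = 4096 bytes, y = 1.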
void MemoryOptimizer::Optimize(
const mace::OperatorDef *op_def,
const std::unordered_map<std::string, MemoryType> &mem_types) {
MACE_LATENCY_LOGGER(2, "Optimize memory");
if (op_def->output_size() != op_def->output_shape_size()) {
VLOG(1) << op_def->name()
<< ": the number of output shape "
<< "is not equal to the number of output";
return;
}
auto device = static_cast<DeviceType>(op_def->device_type());
DataType op_dtype = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
*op_def,
"T",
static_cast<int>(DT_FLOAT)));
MACE_CHECK(
op_def->output_type_size() == 0 ||
op_def->output_size() == op_def->output_type_size(),
"operator output size != operator output type size",
op_def->output_size(),
op_def->output_type_size());
DataType dt;
int output_size = op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (i < op_def->output_type_size()) {
dt = op_def->output_type(i);
} else {
dt = op_dtype;
}
int best_mem_id = -1;
MemoryType mem_type = MemoryType::CPU_BUFFER;
if (device == DeviceType::GPU) {
mem_type = mem_types.at(op_def->output(i));
}
auto shape = std::vector<int64_t>(
op_def->output_shape(i).dims().begin(),
op_def->output_shape(i).dims().end());
MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type);
MemoryBlock best_mem_block;
if (IsMemoryReuseOp(op_def->type())) {
if (tensor_mem_map_.count(op_def->input(0)) == 1) {
best_mem_id = tensor_mem_map_[op_def->input(0)].first;
}
} else {
auto shape = std::vector<int64_t>(
op_def->output_shape(i).dims().begin(),
op_def->output_shape(i).dims().end());
int64_t op_mem_size = op_mem_block.x() * op_mem_block.y();
int64_t best_added_mem_size = LLONG_MAX;
int64_t best_wasted_mem_size = LLONG_MAX;
int64_t old_mem_size = 0, new_mem_size = 0;
MemoryBlock new_mem_block;
for (auto idle_mem_id : idle_blocks_) {
if (mem_blocks_[idle_mem_id].mem_type() == mem_type) {
if (mem_type == MemoryType::GPU_IMAGE) {
// GPU Image could reuse memory with same data type only
if (mem_blocks_[idle_mem_id].data_type() != dt) {
continue;
}
old_mem_size =
mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y();
new_mem_block.set_x(std::max<int64_t>(mem_blocks_[idle_mem_id].x(),
op_mem_block.x()));
new_mem_block.set_y(std::max<int64_t>(mem_blocks_[idle_mem_id].y(),
op_mem_block.y()));
new_mem_size = new_mem_block.x() * new_mem_block.y();
} else {
old_mem_size = mem_blocks_[idle_mem_id].x();
new_mem_size = std::max(op_mem_size, old_mem_size);
new_mem_block.set_x(new_mem_size);
}
int64_t added_mem_size = new_mem_size - old_mem_size;
int64_t wasted_mem_size = new_mem_size - op_mem_size;
// minimize added_mem_size; if best_added_mem_size is 0,
// then minimize wasted_mem_size
if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size)
|| (best_added_mem_size == 0 &&
wasted_mem_size < best_wasted_mem_size)) {
best_mem_id = idle_mem_id;
best_added_mem_size = added_mem_size;
best_wasted_mem_size = wasted_mem_size;
best_mem_block = new_mem_block;
}
}
}
if (best_added_mem_size <= op_mem_size) {
best_mem_block.set_mem_id(best_mem_id);
best_mem_block.set_data_type(dt);
best_mem_block.set_mem_type(mem_type);
mem_blocks_[best_mem_id] = best_mem_block;
idle_blocks_.erase(best_mem_id);
} else {
best_mem_id = static_cast<int>(mem_blocks_.size());
best_mem_block.set_mem_id(best_mem_id);
best_mem_block.set_data_type(dt);
best_mem_block.set_mem_type(mem_type);
best_mem_block.set_x(op_mem_block.x());
best_mem_block.set_y(op_mem_block.y());
mem_blocks_.push_back(best_mem_block);
}
}
if (best_mem_id != -1) {
if (mem_ref_count_.count(best_mem_id) == 1) {
mem_ref_count_[best_mem_id] += 1;
} else {
mem_ref_count_[best_mem_id] = 1;
}
tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
}
}
// release references to the input tensors
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
auto &input_name = op_def->input(i);
if (tensor_ref_count_.count(input_name) == 1) {
tensor_ref_count_[input_name] -= 1;
if (tensor_ref_count_.at(input_name) == 0 &&
tensor_mem_map_.count(input_name) == 1) {
int mem_id = tensor_mem_map_.at(input_name).first;
mem_ref_count_[mem_id] -= 1;
if (mem_ref_count_.at(mem_id) == 0) {
idle_blocks_.insert(mem_id);
}
} else {
MACE_CHECK(tensor_ref_count_.at(input_name) >= 0);
}
}
}
}
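To make the best-fit reuse decision above concrete, a short trace with assumed sizes:

// Illustrative trace (CPU_BUFFER case, sizes made up):
//   op needs 800 bytes; idle blocks: {id 0: 512 B, id 1: 1024 B}
//   id 0: added = 800 - 512 = 288, wasted = 0
//   id 1: added = 0,               wasted = 1024 - 800 = 224
//   id 1 wins on smaller growth, and since best_added_mem_size (0)
//   <= op_mem_size (800), the op reuses block 1 instead of
//   appending a new block.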
const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
return mem_blocks_;
}
const std::unordered_map<std::string, std::pair<int, DataType>>&
MemoryOptimizer::tensor_mem_map() const {
return tensor_mem_map_;
}
std::string MemoryOptimizer::DebugInfo() const {
auto memory_type_to_str = [](const MemoryType type) -> std::string {
if (type == MemoryType::CPU_BUFFER) {
return "CPU_BUFFER";
} else if (type == MemoryType::GPU_BUFFER) {
return "GPU_BUFFER";
} else if (type == MemoryType::GPU_IMAGE) {
return "GPU_IMAGE";
} else {
return "UNKNOWN";
}
};
std::stringstream sstream;
sstream << "\n";
size_t block_size = mem_blocks_.size();
for (size_t i = 0; i < block_size; ++i) {
sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type())
<< " ";
if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) {
sstream << DataTypeToString(mem_blocks_[i].data_type()) << " "
"[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]";
} else {
sstream << "[" << mem_blocks_[i].x() << "]";
}
sstream << "\n";
}
return sstream.str();
}
} // namespace mace
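Putting the pieces together, the intended call order (mirrored by the SerialNet changes below) is roughly the following sketch, where output_mem_map is assumed to be the per-tensor memory-type map the caller built:

// Sketch, assuming a populated NetDef and output_mem_map.
MemoryOptimizer mem_optimizer;
for (const auto &op : net_def.op()) {
  mem_optimizer.UpdateTensorRef(&op);   // count producers and consumers
}
for (const auto &output_info : net_def.output_info()) {
  mem_optimizer.UpdateTensorRef(output_info.name());  // pin net outputs
}
for (const auto &op : net_def.op()) {
  mem_optimizer.Optimize(&op, output_mem_map);  // assign or reuse blocks
}
VLOG(1) << mem_optimizer.DebugInfo();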
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_
#define MACE_CORE_MEMORY_OPTIMIZER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
class MemoryBlock {
public:
inline void set_mem_id(int mem_id) {
mem_id_ = mem_id;
}
inline int mem_id() const {
return mem_id_;
}
inline void set_data_type(DataType data_type) {
data_type_ = data_type;
}
inline DataType data_type() const {
return data_type_;
}
inline void set_mem_type(MemoryType mem_type) {
mem_type_ = mem_type;
}
inline MemoryType mem_type() const {
return mem_type_;
}
inline void set_x(int64_t x) {
x_ = x;
}
inline int64_t x() const {
return x_;
}
inline void set_y(int64_t y) {
y_ = y;
}
inline int64_t y() const {
return y_;
}
private:
int mem_id_;
DataType data_type_;
MemoryType mem_type_;
int64_t x_;
int64_t y_;
};
class MemoryOptimizer {
public:
static bool IsMemoryReuseOp(const std::string &op_type);
void UpdateTensorRef(const std::string &tensor_name);
void UpdateTensorRef(const OperatorDef *op_def);
void Optimize(const OperatorDef *op_def,
const std::unordered_map<std::string, MemoryType> &mem_types);
const std::vector<MemoryBlock> &mem_blocks() const;
const std::unordered_map<std::string,
std::pair<int, DataType>> &tensor_mem_map() const;
std::string DebugInfo() const;
private:
MemoryBlock CreateMemoryBlock(std::vector<int64_t> shape,
DataType dt,
MemoryType mem_type);
private:
std::unordered_map<std::string, int> tensor_ref_count_;
std::vector<MemoryBlock> mem_blocks_;
// tensor name : <mem_id, data_type>
// Buffer memory does not carry a data type of its own, so store the data type here.
std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
std::unordered_map<int, int> mem_ref_count_;
std::set<int> idle_blocks_;
};
} // namespace mace
#endif // MACE_CORE_MEMORY_OPTIMIZER_H_
......@@ -18,6 +18,7 @@
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/core/op_context.h"
#include "mace/public/mace.h"
......@@ -25,13 +26,94 @@
#include "mace/utils/timer.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace {
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {}
MemoryType mem_type; // transformed memory type
DataType dtype;
std::vector<index_t> shape; // tensor shape
int op_idx;  // the operation that generates the tensor
};
#ifdef MACE_ENABLE_OPENCL
std::string TransformedName(const std::string &input_name,
const mace::MemoryType mem_type) {
std::stringstream ss;
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat data_format_flag,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
// Get available devices
auto available_devices = op_registry->AvailableDevices(op_def->type());
// Find the device type to run the op.
// If target_device_type is among the available devices, use it;
// otherwise, fall back to the CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context->set_device(target_device_);
if (target_device_->device_type() == DeviceType::GPU) {
construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
}
break;
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (data_format_flag == NHWC &&
op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
{0, 3, 1, 2});
for (int i = 0; i < 4; ++i) {
op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
}
}
}
}
construct_context->set_operator_def(op_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(construct_context, device_type));
return std::move(op);
}
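For example, with data_format_flag == NHWC a 4D CPU output shape is rewritten in place before the op is created; a minimal illustration of the transpose used above:

// Illustrative: {1, 224, 224, 3} (NHWC) becomes {1, 3, 224, 224} (NCHW),
// matching the layout CPU kernels compute in.
std::vector<index_t> nhwc{1, 224, 224, 3};
auto nchw = TransposeShape<index_t, index_t>(nhwc, {0, 3, 1, 2});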
SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode)
MemoryOptimizer *mem_optimizer)
: NetBase(),
ws_(ws),
target_device_(target_device),
......@@ -40,44 +122,211 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Create Operations
DeviceType target_device_type = target_device_->device_type();
// output tensor : related information
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
// add input information
MemoryType target_mem_type;
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
// net-level input data format; set to DF_NONE below if any input has no format
DataFormat data_format_flag = NHWC;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// Can only be NONE or NHWC
auto input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (!is_quantize_model &&
input_data_format == NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
} else if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#ifdef MACE_ENABLE_OPENCL
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
// Create the Operation
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(target_device_type));
if (op_device == target_device_type) {
// Get available devices (sorted based on priority)
OperatorDef temp_def(operator_def);
auto available_devices = op_registry->AvailableDevices(temp_def.type());
// Find the device type to run the op.
// If target_device_type is among the available devices, use it;
// otherwise, fall back to the CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context.set_device(cpu_device_);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context.set_device(target_device_);
break;
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create operation
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
data_format_flag,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
const DataType dt =
static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
// the memory type of the operation's outputs
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether it is the output tensor of another operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
dt, out_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// graph-level tensor reference counting happens here.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
temp_def.set_device_type(device_type);
construct_context.set_operator_def(&temp_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(&construct_context, device_type, mode));
if (op) {
operators_.emplace_back(std::move(op));
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
}
#endif // MACE_ENABLE_OPENCL
operators_.emplace_back(std::move(op));
// graph-level tensor reference counting happens here.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
auto output_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_data_format);
operators_.emplace_back(std::move(transform_op));
// graph-level tensor reference counting happens here.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
}
}
}
#endif // MACE_ENABLE_OPENCL
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
}
// Do memory optimization
for (auto &op : operators_) {
VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type()
<< ", " << op->debug_def().type() << ">";
mem_optimizer->Optimize(op->operator_def().get(), output_mem_map);
}
VLOG(1) << mem_optimizer->DebugInfo();
}
MaceStatus SerialNet::Init() {
// TODO(liuqi): where to do memory reuse.
MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
OpInitContext init_context(ws_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
......@@ -95,18 +344,18 @@ MaceStatus SerialNet::Init() {
}
MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
// TODO(liuqi): In/Out Buffer Transform
MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net");
OpContext context(ws_, cpu_device_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter;
DeviceType device_type = op->device_type();
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(), ">",
". mem_id: ",
MakeListString(op->debug_def().mem_id().data(),
op->debug_def().mem_id().size()));
MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(),
", ",
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op->debug_def(), "T", static_cast<int>(DT_FLOAT)),
">");
if (device_type == target_device_->device_type()) {
context.set_device(target_device_);
} else {
......@@ -173,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max();
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
max_v = std::max(max_v, output_data[j]);
min_v = std::min(min_v, output_data[j]);
......@@ -189,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::vector<int> bin_distribution(bin_size, 0);
float bin_v = (max_v - min_v) / bin_size;
Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>();
auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) {
int ind = static_cast<int>((output_data[j] - min_v) / bin_v);
if (ind < 0)
ind = 0;
else if (ind > bin_size-1)
ind = bin_size-1;
bin_distribution[ind]++;
int index = static_cast<int>((output_data[j] - min_v) / bin_v);
if (index < 0)
index = 0;
else if (index > bin_size-1)
index = bin_size-1;
bin_distribution[index]++;
}
LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
<< "@@" << min_v << "," << max_v<< "@@"
......
......@@ -27,6 +27,7 @@ namespace mace {
class RunMetadata;
class Workspace;
class MemoryOptimizer;
class NetBase {
public:
......@@ -47,12 +48,20 @@ class SerialNet : public NetBase {
const NetDef *net_def,
Workspace *ws,
Device *target_device,
const NetMode mode = NetMode::NORMAL);
MemoryOptimizer * mem_optimizer);
MaceStatus Init() override;
MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat input_format,
bool is_quantize_model = false);
protected:
Workspace *ws_;
Device *target_device_;
......
......@@ -23,16 +23,12 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {}
OpConstructContext::OpConstructContext(OperatorDef *operator_def,
Workspace *ws,
Device *device)
: operator_def_(operator_def), ws_(ws), device_(device) {}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context)
: operator_def_(std::make_shared<OperatorDef>(*(context->operator_def())))
: operator_def_(context->operator_def())
{}
MaceStatus Operation::Init(OpInitContext *context) {
......@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) {
": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor);
}
// TODO(liuqi): filter transform
for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) {
// TODO(liuqi): Workspace should pre-allocate all of the output tensors
outputs_.push_back(ws->GetTensor(output_str));
} else {
MACE_CHECK(
......@@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) {
}
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type)));
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
if (i < operator_def_->output_shape_size()) {
std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
}
}
return MaceStatus::MACE_SUCCESS;
......@@ -164,33 +157,34 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const {
OperatorDef *operator_def = context->operator_def();
const DataType dtype = static_cast<DataType>(
DeviceType device_type) const {
auto operator_def = context->operator_def();
DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT)));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "mode", static_cast<int>(NetMode::NORMAL));
const NetMode op_mode = static_cast<NetMode>(op_mode_i);
VLOG(3) << "Creating operator " << operator_def->name() << "("
if (device_type == DeviceType::CPU && dtype == DT_HALF) {
int arg_size = operator_def->arg_size();
for (int i = 0; i < arg_size; ++i) {
if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type;
if (op_mode == mode) {
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
} else {
return nullptr;
const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
.Device(device_type)
.TypeConstraint("T", dtype)
.Build();
if (registry_.at(op_type)->creators.count(key) == 0) {
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
}
} // namespace mace
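A hedged sketch of what a lookup resolves to, assuming a "Conv2D" op registered for CPU with T = float, and that ws and op_def (a std::shared_ptr&lt;OperatorDef&gt;) already exist:

// Sketch only: resolve a creator through the registry.
OpConstructContext context(ws);
context.set_operator_def(op_def);  // op_def->type() == "Conv2D"
auto op = op_registry->CreateOperation(&context, DeviceType::CPU);
// Internally the key is
//   OpKeyBuilder("Conv2D").Device(DeviceType::CPU)
//       .TypeConstraint("T", DT_FLOAT).Build(),
// and a missing key is fatal.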
......@@ -33,14 +33,13 @@ namespace mace {
class OpConstructContext {
public:
explicit OpConstructContext(Workspace *ws);
OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device);
~OpConstructContext() = default;
inline void set_operator_def(OperatorDef *operator_def) {
inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
inline OperatorDef *operator_def() const {
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
}
......@@ -56,10 +55,19 @@ class OpConstructContext {
return device_;
}
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
private:
OperatorDef *operator_def_;
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
MemoryType output_mem_type_; // used for transform memory
};
......@@ -131,14 +139,18 @@ class Operation {
}
inline void set_debug_def(
const std::shared_ptr<const OperatorDef> &operator_def) {
const std::shared_ptr<OperatorDef> &operator_def) {
operator_def_ = operator_def;
}
inline bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected:
std::shared_ptr<const OperatorDef> operator_def_;
std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_;
......@@ -190,8 +202,7 @@ class OpRegistryBase {
std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context,
DeviceType device_type,
const NetMode mode) const;
DeviceType device_type) const;
template <class DerivedType>
static std::unique_ptr<Operation> DefaultCreator(
......
......@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime(
is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN),
mem_type_(MemoryType::GPU_IMAGE) {
mem_type_(MemoryType::GPU_IMAGE),
scratch_image_manager_(new ScratchImageManager) {
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
......@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_;
}
ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
return scratch_image_manager_.get();
}
} // namespace mace
......@@ -25,6 +25,7 @@
#include "mace/core/file_storage.h"
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/scratch_image.h"
#include "mace/proto/mace.pb.h"
#include "mace/utils/string_util.h"
#include "mace/utils/timer.h"
......@@ -82,6 +83,7 @@ class OpenCLRuntime {
uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner();
ScratchImageManager *scratch_image_manager() const;
bool is_opencl_avaliable();
// TODO(liuqi): remove this function in the future, make decision at runtime.
bool UseImageMemory();
......@@ -134,6 +136,7 @@ class OpenCLRuntime {
OpenCLVersion opencl_version_;
GPUType gpu_type_;
MemoryType mem_type_;
std::unique_ptr<ScratchImageManager> scratch_image_manager_;
// All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library.
std::shared_ptr<cl::Context> context_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_util.h"
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
namespace {
// [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic, H * W * (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1];
(*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* MIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[1]);
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[0]);
(*image_shape)[1] = 1;
}
// Only supports 3x3 for now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic * H * W, (Oc + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[0]);
}
// [(Ic + 3) / 4 * H * W, Oc]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3];
(*image_shape)[1] = shape[0];
}
} // namespace
void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const OpenCLBufferType type,
std::vector<size_t> *image_shape,
const int wino_block_size) {
MACE_CHECK_NOTNULL(image_shape);
switch (type) {
case CONV2D_FILTER:
CalConv2dFilterImageShape(shape, image_shape);
break;
case DW_CONV2D_FILTER:
CalDepthwiseConv2dFilterImageShape(shape, image_shape);
break;
case IN_OUT_CHANNEL:
CalInOutputImageShape(shape, image_shape);
break;
case ARGUMENT:
CalArgImageShape(shape, image_shape);
break;
case IN_OUT_HEIGHT:
CalInOutHeightImageShape(shape, image_shape);
break;
case IN_OUT_WIDTH:
CalInOutWidthImageShape(shape, image_shape);
break;
case WINOGRAD_FILTER:
CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
break;
case WEIGHT_HEIGHT:
CalWeightHeightImageShape(shape, image_shape);
break;
case WEIGHT_WIDTH:
CalWeightWidthImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
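A quick usage example for the common activation case; the numbers follow from CalInOutputImageShape above.

// Example: a {1, 32, 32, 64} NHWC activation tensor.
std::vector<size_t> image_shape;
OpenCLUtil::CalImage2DShape({1, 32, 32, 64},
                            OpenCLBufferType::IN_OUT_CHANNEL,
                            &image_shape);
// image_shape == {512, 32}: RoundUpDiv4(64) * 32 = 512, 1 * 32 = 32.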
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const mace::MemoryType mem_type) {
std::unique_ptr<OperatorDef> op(new OperatorDef);
std::string op_name = "mace_node_" + output_name;
op->set_name(op_name);
op->set_type("BufferTransform");
op->add_input(input_name);
op->add_output(output_name);
Argument *arg = op->add_arg();
arg->set_name("buffer_type");
arg->set_i(static_cast<int32_t>(OpenCLBufferType::IN_OUT_CHANNEL));
arg = op->add_arg();
arg->set_name("mem_type");
arg->set_i(static_cast<int32_t>(mem_type));
arg = op->add_arg();
arg->set_name("T");
arg->set_i(static_cast<int32_t>(dt));
arg = op->add_arg();
arg->set_name("device");
arg->set_i(DeviceType::GPU);
if (!input_shape.empty()) {
OutputShape *shape = op->add_output_shape();
for (auto value : input_shape) {
shape->add_dims(value);
}
}
return std::move(op);
}
} // namespace mace
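A usage sketch (the output tensor name below is arbitrary; in SerialNet it comes from TransformedName):

// Sketch: synthesize the implicit BufferTransform node that moves a
// tensor named "input" into half-precision GPU image memory.
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
    "input", {1, 224, 224, 3}, "input_transformed",
    DataType::DT_HALF, MemoryType::GPU_IMAGE);
// The returned def has type "BufferTransform" and carries the
// "buffer_type", "mem_type", "T" and "device" arguments set above.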
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#include <memory>
#include <string>
#include <vector>
#include "mace/core/types.h"
namespace mace {
enum OpenCLBufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
ARGUMENT = 2,
IN_OUT_HEIGHT = 3,
IN_OUT_WIDTH = 4,
WINOGRAD_FILTER = 5,
DW_CONV2D_FILTER = 6,
WEIGHT_HEIGHT = 7,
WEIGHT_WIDTH = 8,
};
class OpenCLUtil {
public:
static void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const OpenCLBufferType type,
std::vector<size_t> *image_shape,
const int wino_blk_size = 2);
static std::shared_ptr<OperatorDef> CreateTransformOpDef(
const std::string &input_name,
const std::vector<mace::index_t> &input_shape,
const std::string &output_name,
const mace::DataType dt,
const MemoryType mem_type);
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/scratch_image.h"
#include <utility>
#include <vector>
namespace mace {
ScratchImageManager::ScratchImageManager() = default;
ScratchImageManager::~ScratchImageManager() = default;
Image *ScratchImageManager::Spawn(
Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id) {
// TODO(liuqi): not optimal memory reuse strategy
int found_image_idx = -1;
int image_count = static_cast<int>(reference_count_.size());
for (int i = 0; i < image_count; ++i) {
int count = reference_count_[i];
if (count == 0 && images_.at(i)->dtype() == dt) {
auto image_shape = images_.at(i)->image_shape();
if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) {
found_image_idx = i;
break;
}
}
}
// if not found
if (found_image_idx == -1) {
reference_count_.push_back(0);
images_[image_count] =
std::unique_ptr<Image>(new Image(allocator));
if (images_.at(image_count)->Allocate(shape, dt) !=
MaceStatus::MACE_SUCCESS) {
return nullptr;
}
found_image_idx = image_count;
VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape)
<< "<" << dt << ">";
}
reference_count_[found_image_idx] += 1;
*id = found_image_idx;
return images_.at(found_image_idx).get();
}
void ScratchImageManager::Deactive(int id) {
MACE_CHECK(reference_count_.size() > static_cast<size_t>(id)
&& reference_count_[id] > 0,
"Image id ", id, " exceed the vector size ",
reference_count_.size());
reference_count_[id] -= 1;
}
ScratchImage::ScratchImage(mace::ScratchImageManager *manager)
: manager_(manager), id_(-1) {}
ScratchImage::~ScratchImage() {
if (id_ >= 0) {
manager_->Deactive(id_);
}
}
Image* ScratchImage::Scratch(Allocator *allocator,
const std::vector<size_t> &shape,
const mace::DataType dt) {
return manager_->Spawn(allocator, shape, dt, &id_);
}
} // namespace mace
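A short RAII sketch of how a kernel is expected to borrow a scratch image; runtime and allocator are assumed to come from the surrounding OpContext:

// Sketch: the image stays reserved for the lifetime of `scratch`.
ScratchImage scratch(runtime->scratch_image_manager());
Image *image = scratch.Scratch(allocator, {512, 32}, DataType::DT_HALF);
// ~ScratchImage() calls Deactive(id), returning the image to the pool.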
......@@ -12,39 +12,47 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
#include "mace/core/buffer.h"
namespace mace {
class OpContext;
class Tensor;
class ScratchImageManager {
public:
ScratchImageManager();
~ScratchImageManager();
Image *Spawn(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id);
namespace ops {
void Deactive(int id);
class OpenCLWinogradTransformKernel {
public:
virtual MaceStatus Compute(
OpContext *context,
const Tensor *input,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel);
private:
std::unordered_map<int, std::unique_ptr<Image>> images_;
std::vector<int> reference_count_;
};
class OpenCLWinogradInverseTransformKernel {
class ScratchImage {
public:
virtual MaceStatus Compute(
OpContext *context,
const std::vector<const Tensor*> &inputs,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel);
explicit ScratchImage(ScratchImageManager *);
~ScratchImage();
Image *Scratch(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt);
private:
ScratchImageManager *manager_;
int id_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
......@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
}
} // namespace numerical_chars
enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 };
enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 };
class Tensor {
public:
......@@ -222,6 +222,25 @@ class Tensor {
return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
}
inline MemoryType memory_type() const {
MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty");
if (buffer_->OnHost()) {
return MemoryType::CPU_BUFFER;
} else if (typeid(*buffer_) == typeid(Image)) {
return MemoryType::GPU_IMAGE;
} else {
return MemoryType::GPU_BUFFER;
}
}
inline void set_data_format(DataFormat data_format) {
data_format_ = data_format;
}
inline DataFormat data_format() const {
return data_format_;
}
#ifdef MACE_ENABLE_OPENCL
inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), name_, " does not have an image");
......@@ -488,6 +507,7 @@ class Tensor {
int32_t zero_point_;
float minval_;
float maxval_;
DataFormat data_format_; // used for 4D input/output tensor
MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
};
......
......@@ -18,6 +18,7 @@
#include <utility>
#include "mace/core/arg_helper.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -27,13 +28,6 @@
namespace mace {
namespace {
bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
static const std::unordered_set<std::string> reuse_buffer_ops {
"Reshape", "Identity", "Squeeze"
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) {
if (op.type() == "Quantize") {
......@@ -48,13 +42,14 @@ Workspace::Workspace() = default;
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
DataType type) {
DataType type,
bool is_weight) {
if (HasTensor(name)) {
VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
false, name));
is_weight, name));
}
return GetTensor(name);
}
......@@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
fused_buffer_ = true;
}
}
return MaceStatus::MACE_SUCCESS;
}
if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status;
MaceStatus Workspace::PreallocateOutputTensor(
const mace::NetDef &net_def,
const mace::MemoryOptimizer *mem_optimizer,
Device *device) {
auto &mem_blocks = mem_optimizer->mem_blocks();
for (auto &mem_block : mem_blocks) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", memory type: " << mem_block.mem_type()
<< ", size: " << mem_block.x() << "x" << mem_block.y();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{static_cast<size_t>(mem_block.x()),
static_cast<size_t>(mem_block.y())}, mem_block.data_type()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first),
tensor_mem.second.second,
false, tensor_mem.first));
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
tensor_map_[tensor_mem.first] = std::move(tensor);
}
if (device_type == DeviceType::CPU) {
// add quantize info for output tensors.
if (device->device_type() == DeviceType::CPU) {
for (const auto &op : net_def.op()) {
VLOG(2) << "Add quantize info for op: " << op.name();
MACE_CHECK(op.quantize_info().empty()
......@@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
return MaceStatus::MACE_SUCCESS;
}
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id,
// as CPU&GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type && !op.mem_id().empty()) {
const DataType op_dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
break;
}
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
}
// TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.device_type() == device_type) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", device type: " << mem_block.device_type()
<< ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
}
VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type) {
if (!op.mem_id().empty()
&& ShouldPreallocateMemoryForOp(op)) {
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = dtype;
}
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
output_type, false, op.output(i)));
if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
} else {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
}
tensor_map_[op.output(i)] = std::move(tensor);
}
} else {
for (int i = 0; i < op.output().size(); ++i) {
MACE_CHECK(
op.output_type_size() == 0
|| op.output_size()
== op.output_type_size(),
"operator output size != operator output type size",
op.output_size(),
op.output_type_size());
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
op, "T", static_cast<int>(DT_FLOAT)));
}
CreateTensor(op.output(i),
device->allocator(),
output_type);
}
}
for (int output_idx = 0; output_idx < op.output_shape_size();
++output_idx) {
std::vector<index_t>
shape_configured(op.output_shape(output_idx).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = op.output_shape(output_idx).dims(dim);
}
tensor_map_[op.output(output_idx)]->SetShapeConfigured(
shape_configured);
}
}
}
return MaceStatus::MACE_SUCCESS;
}
void Workspace::RemoveUnusedBuffer() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
......@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
tensor_buffer_.reset(nullptr);
}
void Workspace::RemoveTensor(const std::string &name) {
auto iter = tensor_map_.find(name);
if (iter != tensor_map_.end()) {
tensor_map_.erase(iter);
}
}
} // namespace mace
......@@ -27,6 +27,8 @@
namespace mace {
class MemoryOptimizer;
class Workspace {
public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
......@@ -36,7 +38,8 @@ class Workspace {
Tensor *CreateTensor(const std::string &name,
Allocator *alloc,
DataType type);
DataType type,
bool is_weight = false);
inline bool HasTensor(const std::string &name) const {
return tensor_map_.find(name) != tensor_map_.end();
......@@ -52,12 +55,19 @@ class Workspace {
Device *device,
const unsigned char *model_data);
MaceStatus PreallocateOutputTensor(const NetDef &net_def,
const MemoryOptimizer *mem_optimizer,
Device *device);
void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data,
Allocator *alloc);
void RemoveTensor(const std::string &name);
private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
Device *device);
......
......@@ -20,9 +20,11 @@
#include <memory>
#include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/ops_registry.h"
#include "mace/ops/transpose.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
......@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
  // Check OpenCL availability
auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) {
LOG(WARNING) << "The device does not support OpenCL";
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
......@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
auto opencl_max_image_size = runtime->GetMaxImage2DSize();
if (opencl_max_image_size.empty()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
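    // The model records the largest 2-D OpenCL image it will request;
    // fail fast when the device limit is smaller.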
const std::vector<int64_t> net_max_image_size =
ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
*net_def, "opencl_max_image_size", {0, 0});
if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
|| static_cast<uint64_t>(net_max_image_size[1])
> opencl_max_image_size[1]) {
LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
<< " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -288,14 +269,17 @@ class MaceTensor::Impl {
public:
std::vector<int64_t> shape;
std::shared_ptr<float> data;
DataFormat format;
};
MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data) {
std::shared_ptr<float> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get());
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = shape;
impl_->data = data;
impl_->format = format;
}
MaceTensor::MaceTensor() {
......@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
}
MaceTensor::MaceTensor(const MaceTensor &&other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
}
MaceTensor &MaceTensor::operator=(const MaceTensor &other) {
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
return *this;
}
MaceTensor &MaceTensor::operator=(const MaceTensor &&other) {
impl_->shape = other.shape();
impl_->data = other.data();
impl_->format = other.data_format();
return *this;
}
......@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
DataFormat MaceTensor::data_format() const {
return impl_->format;
}
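// A minimal usage sketch (shape and buffer sizes are illustrative):
//   auto buf = std::shared_ptr<float>(new float[1 * 224 * 224 * 3],
//                                     std::default_delete<float[]>());
//   MaceTensor input({1, 224, 224, 3}, buf, NHWC);
//   // input.data_format() == NHWC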
// Mace Engine
class MaceEngine::Impl {
public:
......@@ -355,6 +347,14 @@ class MaceEngine::Impl {
std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata);
private:
MaceStatus TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor);
MaceStatus TransposeOutput(const Tensor *output_tensor,
std::pair<const std::string, MaceTensor> *output);
private:
const unsigned char *model_data_;
size_t model_data_size_;
......@@ -363,11 +363,12 @@ class MaceEngine::Impl {
std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_;
std::map<std::string, mace::OutputInfo> output_info_map_;
bool is_quantized_model_;
#ifdef MACE_ENABLE_HEXAGON
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
MACE_DISABLE_COPY_AND_ASSIGN(Impl);
};
......@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
device_type_(config.impl_->device_type()),
device_(nullptr),
ws_(new Workspace()),
net_(nullptr)
net_(nullptr),
is_quantized_model_(false)
#ifdef MACE_ENABLE_HEXAGON
, hexagon_controller_(nullptr)
#endif
......@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
}
#endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
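  // Quantized models compute in NHWC even on CPU, so this flag gates the
  // layout transposes in TransposeInput/TransposeOutput.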
// Get input and output information.
for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info;
......@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
ws_->CreateTensor(MakeString("mace_input_node_", input_name),
device_->allocator(), DT_FLOAT);
ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
}
for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) {
......@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's outputs "
<< MakeString(MapKeys(output_info_map_));
}
ws_->CreateTensor(MakeString("mace_output_node_", output_name),
device_->allocator(), DT_FLOAT);
}
#ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) {
......@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init(
device_.get(),
model_data));
MemoryOptimizer mem_optimizer;
// Init model
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
NetMode::INIT));
MACE_RETURN_IF_ERROR(net->Init());
MACE_RETURN_IF_ERROR(net->Run());
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def,
ws_.get(),
device_.get()));
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
MACE_RETURN_IF_ERROR(net_->Init());
#ifdef MACE_ENABLE_HEXAGON
}
......@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() {
#endif
}
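// Callers exchange float tensors with the engine; float CPU models compute
// in NCHW while GPU and quantized models compute in NHWC, so 4-D inputs
// whose declared format differs from the runtime layout are transposed here.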
MaceStatus MaceEngine::Impl::TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor) {
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
std::vector<int> dst_dims = {0, 3, 1, 2};
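    // NHWC -> NCHW permutation, e.g. {1, 224, 224, 3} -> {1, 3, 224, 224}.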
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else if (
(is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
input.second.shape().size() == 4 &&
input.second.data_format() == DataFormat::NCHW) {
VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
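    // NCHW -> NHWC permutation, e.g. {1, 3, 224, 224} -> {1, 224, 224, 3}.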
input_tensor->set_data_format(DataFormat::NHWC);
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else {
input_tensor->set_data_format(input.second.data_format());
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
}
MaceStatus MaceEngine::Impl::TransposeOutput(
const mace::Tensor *output_tensor,
std::pair<const std::string, mace::MaceTensor> *output) {
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (device_->device_type() == DeviceType::CPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
MACE_CHECK(output_tensor->data_format() == NCHW);
VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else if (device_->device_type() == DeviceType::GPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
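      // Map the device tensor's layout to the caller's: NHWC -> NCHW by
      // default, NCHW -> NHWC when the device tensor is NCHW.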
std::vector<int> dst_dims = {0, 3, 1, 2};
if (output_tensor->data_format() == NCHW) {
dst_dims = {0, 2, 3, 1};
}
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
std::memcpy(output->second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
}
MaceStatus MaceEngine::Impl::Run(
const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs,
......@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_));
}
Tensor *input_tensor =
ws_->GetTensor(MakeString("mace_input_node_", input.first));
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
{
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
}
Tensor *input_tensor = ws_->GetTensor(input.first);
MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor));
input_tensors.push_back(input_tensor);
}
for (auto &output : *outputs) {
......@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's outputs: "
<< MakeString(MapKeys(output_info_map_));
}
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first));
Tensor *output_tensor = ws_->GetTensor(output.first);
output_tensors.push_back(output_tensor);
}
#ifdef MACE_ENABLE_HEXAGON
......@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run(
}
#endif
for (auto &output : *outputs) {
Tensor *output_tensor =
ws_->GetTensor(MakeString("mace_output_node_", output.first));
Tensor *output_tensor = ws_->GetTensor(output.first);
// save output
if (output_tensor != nullptr && output.second.data() != nullptr) {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output.second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(output.second.shape())
<< " != " << MakeString<int64_t>(shape);
std::memcpy(output.second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output));
}
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -14,7 +14,6 @@ mace {
*mace*NetDef*;
*mace*MemoryType*;
*mace*DataType*;
*mace*MemoryArena*;
*mace*InputInfo*;
*mace*OutputInfo*;
*mace*OutputShape*;
......
......@@ -30,10 +30,8 @@ cc_library(
"arm/*_test.cc",
"ops_registry.cc",
"ops_test_util.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
"quantize.cc",
],
) + if_opencl_enabled(glob(
......@@ -41,10 +39,8 @@ cc_library(
"opencl/*.cc",
"opencl/image/*.cc",
"opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc",
"lstm_cell.cc",
"winograd_transform.cc",
],
exclude = [
"opencl/*_test.cc",
......
......@@ -19,6 +19,7 @@
#include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/activation.h"
#endif // MACE_ENABLE_OPENCL
......@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
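    // PRELU carries a learned alpha tensor as input 1; transform it to the
    // GPU argument layout once at construction rather than on every run.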
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
......@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, float>("Alpha", {channels});
net.AddRandomInput<D, T>("Alpha", {channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluBM")
.Input("InputImage")
.Input("AlphaImage")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -310,27 +262,17 @@ void SigmoidBenchmark(
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
}
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -30,32 +30,14 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
......@@ -129,34 +93,15 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -179,34 +124,15 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
......@@ -232,45 +158,36 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}, true);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluTest")
.Input("InputImage")
.Input("AlphaImage")
.Output("OutputImage")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Activation", "PreluTest")
.Input("Input")
.Input("InputNCHW")
.Input("Alpha")
.Output("Output")
.Output("OutputNCHW")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
}
if (D == DeviceType::CPU) {
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
......@@ -290,32 +207,14 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
......@@ -343,32 +242,14 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
......
......@@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
}
if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(),
ops::BufferType::IN_OUT_CHANNEL);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -62,39 +62,15 @@ void SimpleAdd3() {
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1},
{-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4;
if (D == DeviceType::GPU) {
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
auto expected =
net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
......@@ -138,28 +114,10 @@ void RandomTest() {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
  // Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2,
1e-2);
}
}
......
......@@ -19,6 +19,7 @@
#include "mace/core/operator.h"
#include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/batch_norm.h"
#endif // MACE_ENABLE_OPENCL
......@@ -147,12 +148,27 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
ActivationType activation = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BatchNormKernel<T>(
epsilon, activation, relux_max_limit));
} else {
MACE_NOT_IMPLEMENTED;
}
    // Transform the scale/offset/mean/var argument tensors (inputs 1..N)
    // to the GPU argument layout.
int input_size = operator_def_->input_size();
for (int i = 1; i < input_size; ++i) {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
bool not_folded = this->InputSize() == 5;
......
......@@ -36,13 +36,12 @@ void BatchNorm(
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddRandomInput<D, T>("Scale", {channels}, true);
net.AddRandomInput<D, T>("Offset", {channels}, true);
net.AddRandomInput<D, T>("Mean", {channels}, true);
net.AddRandomInput<D, T>("Var", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BatchNorm", "BatchNormBM")
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
......@@ -50,30 +49,8 @@ void BatchNorm(
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// tuning
setenv("MACE_TUNING", "1", 1);
......
......@@ -28,10 +28,10 @@ void Simple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f});
net.AddInputFromArray<D, float>("Offset", {1}, {2.0});
net.AddInputFromArray<D, float>("Mean", {1}, {10});
net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}, true);
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}, true);
net.AddInputFromArray<D, float>("Mean", {1}, {10}, true);
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
......@@ -49,32 +49,17 @@ void Simple() {
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
// Check
......@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Tuning
......@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-5, 1e-4);
}
......@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
......@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-1, 1e-2);
}
......@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
// tuning
......@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-5, 1e-4);
}
......@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
......@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
1e-1, 1e-2);
}
......
......@@ -32,23 +32,13 @@ void BMBatchToSpace(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::CPU) {
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
}
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input")
.Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
......
......@@ -19,6 +19,7 @@
#include "mace/core/operator.h"
#include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/bias_add.h"
#endif // MACE_ENABLE_OPENCL
......@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
: Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
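    // The bias (input 1) is an argument tensor; transform it to the GPU
    // layout once at construction.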
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
......@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net;
// Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Bias", {channels}, true);
net.AddRandomInput<D, T>("Bias", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BiasAdd", "BiasAddBM")
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input")
.Input("Bias")
.AddIntArg("data_format", NCHW)
.AddIntArg("data_format", data_format)
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("InputImage")
.Input("BiasImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -28,7 +28,7 @@ void BiasAddSimple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
......@@ -44,22 +44,13 @@ void BiasAddSimple() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
MACE_NOT_IMPLEMENTED;
}
......@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
// Run on gpu
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
......@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
// Run on gpu
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace test
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BufferInverseTransformOp;
template <typename T>
class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferInverseTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::ImageToBuffer<T>);
} else {
kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
}
}
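  // Run maps the OpenCL-side representation (an image when image memory is
  // in use, a transformed buffer otherwise) back to a plain GPU buffer.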
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
}
private:
const int wino_blk_size_;
std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
};
void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
BufferInverseTransformOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace ops
} // namespace mace
......@@ -14,6 +14,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters,
mace::testing::StopTiming();
OpsTestNet net;
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width});
// Create output
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpDefBuilder("BufferToImage", "BufferToImageBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
auto transform_func = [&]() {
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context,
net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
b2i_output);
};
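  // Time the transform call directly instead of running an op graph.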
// Warm-up
net.Setup(D);
for (int i = 0; i < 5; ++i) {
net.Run();
transform_func();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.Run();
transform_func();
}
net.Sync();
}
......
......@@ -14,6 +14,7 @@
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace {
namespace ops {
......@@ -21,31 +22,27 @@ namespace test {
namespace {
template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{1});
}
TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, half>(OpenCLBufferType::ARGUMENT,
{11});
}
TEST(BufferToImageTest, ArgMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256});
TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 1});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{1, 2, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 2, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 13, 17, 128});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL,
{3, 64, 64, 256});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{5, 3, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{13, 17, 1, 1});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{512, 128, 1, 1});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{3, 5, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{17, 13, 3, 3});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER,
TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{256, 128, 3, 3});
}
TEST(BufferToImageTest, WeightWidthSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{1, 3, 3, 3});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{1, 3, 3, 3});
}
TEST(BufferToImageTest, WeightWidthMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{11, 13, 13, 17});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightWidthLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH,
{64, 64, 11, 13});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{64, 64, 11, 13});
}
TEST(BufferToImageTest, WeightHeightSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{2, 1, 1, 1});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{2, 1, 1, 1});
}
TEST(BufferToImageTest, WeightHeightMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{11, 13, 13, 17});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{11, 13, 13, 17});
}
TEST(BufferToImageTest, WeightHeightLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT,
{64, 16, 11, 13});
TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{64, 16, 11, 13});
}
namespace {
template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type,
void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{11});
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT,
{11});
}
namespace {
template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type,
void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape,
const unsigned char *input_data) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2));
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run
net.RunOp(D);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Transform
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
// Run
net.RunOp(D);
// Inverse Transform
Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
......@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40,
};
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT,
{2}, input_data);
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT, {2}, input_data);
}
} // namespace test
......
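For reference, the four raw bytes fed to TestStringHalfBidirectionTransform above encode two little-endian IEEE-754 half values: 0x3CCD ≈ 1.2002 and 0x4033 ≈ 2.0996. A standalone decoding sketch (plain C++, no MACE types involved):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode a little-endian IEEE-754 binary16 value to float.
// Normal and subnormal cases only; Inf/NaN are not needed here.
static float HalfBitsToFloat(uint16_t h) {
  const int sign = (h >> 15) & 0x1;
  const int exp = (h >> 10) & 0x1F;
  const float mant = (h & 0x3FF) / 1024.0f;
  const float value = (exp == 0) ? std::ldexp(mant, -14)          // subnormal
                                 : std::ldexp(1.0f + mant, exp - 15);
  return sign ? -value : value;
}

int main() {
  // Same bytes as the test above: two halfs, little-endian.
  const unsigned char input_data[] = {0xCD, 0x3C, 0x33, 0x40};
  for (int i = 0; i < 2; ++i) {
    const uint16_t bits = static_cast<uint16_t>(
        input_data[2 * i] | (input_data[2 * i + 1] << 8));
    std::printf("0x%04X -> %g\n", bits, HalfBitsToFloat(bits));
  }
  return 0;  // prints ~1.2002 and ~2.0996
}
```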
......@@ -15,8 +15,7 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_transform.h"
#include "mace/ops/opencl/image/buffer_to_image.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace {
namespace ops {
......@@ -29,29 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
public:
explicit BufferTransformOp(OpConstructContext *context)
: Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::BufferToImage<T>);
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)),
out_mem_type_(static_cast<MemoryType>(Operation::GetOptionalArg<int>(
"mem_type", static_cast<int>(MemoryType::GPU_IMAGE)))) {}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
ops::BufferType type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
auto type =
static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(CONV2D_FILTER)));
return kernel_->Compute(context, input, type,
wino_blk_size_, output);
MemoryType in_mem_type = context->workspace()->GetTensor(
operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
}
private:
const int wino_blk_size_;
std::unique_ptr<OpenCLBufferTransformKernel> kernel_;
MemoryType out_mem_type_;
};
......
......@@ -15,6 +15,7 @@
#include <cstring>
#include "gtest/gtest.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase {
namespace {
template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type,
void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
// Run
net.RunOp(DeviceType::GPU);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("TransformedOutput")
.Output("Output")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
Tensor *bt_output = net.ws()->CreateTensor(
"BtOutput", context.device()->allocator(),
DataTypeToEnum<DstType>::value);
OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_BUFFER, 0, bt_output);
// Inverse Transform
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
......@@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type,
} // namespace
TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL,
TestBidirectionTransform<float, half>(OpenCLBufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
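The HalfToHalf case above exercises the degenerate path of the transformer: when neither the memory type nor the data type changes, the output can simply alias the input, which is what the UnderlyingBuffer() equality check in TestBidirectionTransform asserts. A minimal sketch of that decision, with stand-in types rather than the actual MACE implementation:

```cpp
#include <cassert>

enum class MemoryType { GPU_BUFFER, GPU_IMAGE };
enum class DataType { DT_FLOAT, DT_HALF };

struct Buffer {};  // stand-in for opaque device memory

struct Tensor {
  MemoryType mem_type;
  DataType dtype;
  Buffer *buf;
};

// Hypothetical shortcut mirroring what the test asserts: when neither the
// memory type nor the data type changes, the transformer can alias the
// input buffer instead of launching a copy kernel.
bool TransformOrShare(const Tensor &in, Tensor *out) {
  if (in.mem_type == out->mem_type && in.dtype == out->dtype) {
    out->buf = in.buf;  // both tensors now view the same device memory
    return true;        // shared, no copy
  }
  // ... otherwise enqueue an OpenCL conversion kernel ...
  return false;
}

int main() {
  Buffer b;
  Tensor in{MemoryType::GPU_BUFFER, DataType::DT_HALF, &b};
  Tensor out{MemoryType::GPU_BUFFER, DataType::DT_HALF, nullptr};
  assert(TransformOrShare(in, &out));
  assert(out.buf == in.buf);  // matches the UnderlyingBuffer() check above
  return 0;
}
```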
namespace {
template <typename T>
void TestArgumentTransform(const index_t input_size) {
OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run
net.RunOp(DeviceType::GPU);
Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, output);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]);
EXPECT_EQ(expected_size, output->buffer_shape()[0]);
// Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor,
ExpectTensorNear<T>(*net.GetTensor("Input"), *output,
1e-3, 1e-4);
}
} // namespace
......
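The argument-transform test checks that a 1-D ARGUMENT buffer is padded to a multiple of 4 elements; the padding presumably lines up with the 4-wide vectorized accesses of the OpenCL kernels. The RoundUp used there, restated standalone:

```cpp
#include <cassert>
#include <cstdint>

using index_t = int64_t;

// RoundUp as exercised by the argument-transform test: pad a 1-D argument
// buffer (e.g. a bias) up to the next multiple of 4 elements.
template <typename T>
T RoundUp(T value, T multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  assert(RoundUp<index_t>(1, 4) == 4);
  assert(RoundUp<index_t>(5, 4) == 8);
  assert(RoundUp<index_t>(8, 4) == 8);
  return 0;
}
```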
......@@ -36,23 +36,11 @@ void ChannelShuffle(
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Softmax", "SoftmaxBM")
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("OutputImage")
.Input("Input")
.Output("Output")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = net.CreateTensor<float>(
{1, 1, 2, 16},
......
......@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation {
public:
explicit ConcatOpBase(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)) {}
axis_(Operation::GetOptionalArg<int>("axis", 3)),
checked_(false) {}
protected:
void Validate() {
......@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation {
protected:
int axis_;
bool checked_;
};
template <DeviceType D, class T>
......@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
Validate();
if (!checked_) {
Validate();
if (this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
}
checked_ = true;
}
const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0);
const Tensor *input0 = inputs.front();
......
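The lazy axis fix-up added to ConcatOp reflects the unified CPU/GPU graph convention: axis arguments stay in NHWC terms (note axis_arg = 3 paired with axis = 1 in the quantized test below), while CPU tensors are laid out NCHW, so the op translates the axis once on first Run. The mapping, restated standalone:

```cpp
#include <cassert>

// Map a concat axis expressed against NHWC onto an NCHW-resident tensor,
// mirroring the lazy fix-up in ConcatOp above (done once, on first Run):
// N(0)->0, H(1)->2, W(2)->3, C(3)->1.
int NHWCAxisToNCHW(int axis) {
  switch (axis) {
    case 0: return 0;   // batch stays in front
    case 1: return 2;   // height
    case 2: return 3;   // width
    case 3: return 1;   // channels move next to batch
    default: return axis;  // non-4D tensors keep their axis unchanged
  }
}

int main() {
  assert(NHWCAxisToNCHW(3) == 1);  // channel concat, the common case
  assert(NHWCAxisToNCHW(2) == 3);
  assert(NHWCAxisToNCHW(1) == 2);
  return 0;
}
```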
......@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128);
namespace {
template <typename T>
void OpenclConcatHelper(int iters,
void OpenCLConcatHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int concat_dim) {
......@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0")
.Input("InputImage1")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", concat_dim)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters,
#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
OpenCLConcatHelper<TYPE>(iters, shape, shape, 3); \
} \
MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
......
......@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) {
static unsigned int seed = time(NULL);
int dim = 5;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
int axis = 1;
// Construct graph
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
static unsigned int seed = time(NULL);
int dim = 4;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
int axis = 1;
int axis_arg = 3; // NHWC
// Construct graph
OpsTestNet net;
......@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
std::vector<index_t> output_shape = input_shapes[0];
output_shape[axis] = concat_axis_size;
net.AddRandomInput<DeviceType::CPU, float>(
"Output", output_shape, true, true);
"Output", output_shape, false, true, true);
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(MakeString("Input", i));
}
builder.AddIntArg("axis", axis)
builder.AddIntArg("axis", axis_arg)
.Output("Output")
.Finalize(net.NewOperatorDef());
......@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
net.RunOp();
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"QuantizedOutput", output_shape, true, true);
"QuantizedOutput", output_shape, false, true, true);
auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest");
for (int i = 0; i < num_inputs; ++i) {
q_builder = q_builder.Input(MakeString("QuantizedInput", i));
......@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i);
const std::string image_name = MakeString("InputImage", i);
concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
ops::BufferType::IN_OUT_CHANNEL);
}
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
const std::string image_name = MakeString("InputImage", i);
const std::string image_name = MakeString("Input", i);
builder = builder.Input(image_name);
}
builder.AddIntArg("axis", axis)
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check
auto output = net.GetOutput("Output");
......
......@@ -38,8 +38,9 @@
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/conv_2d.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/conv_2d.h"
#include "mace/ops/opencl/image/conv_2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
......@@ -958,13 +959,45 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
: ConvPool2dOpBase(context),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::Conv2dKernel<T>);
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
context->device()->opencl_runtime(),
context->workspace()->GetTensor(
operator_def_->input(1))->shape(),
std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
operator_def_->output_shape(0).dims().end()),
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT);
......@@ -974,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_,
output);
wino_block_size_, output);
}
private:
const ActivationType activation_;
const float relux_max_limit_;
std::unique_ptr<OpenCLConv2dKernel> kernel_;
int wino_block_size_;
private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
......
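The Conv2dOp constructor now pre-transforms the filter exactly once: if a Winograd block size of 2 or 4 is requested and CheckUseWinograd accepts the shapes, the filter goes to the WINOGRAD_FILTER layout; otherwise the block size is zeroed and the plain CONV2D_FILTER layout is used, so Run() takes the direct path. A condensed sketch with stand-in types; the profitability conditions here (3x3, stride 1, no dilation) are an assumption, since the real check also weighs the output shape and may adjust the block size:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using index_t = int64_t;
enum class FilterLayout { WINOGRAD_FILTER, CONV2D_FILTER };

// Stand-in for kernel_->CheckUseWinograd (hypothetical conditions): Winograd
// F(m, 3) is usually only worthwhile for 3x3, stride-1, non-dilated filters.
bool CheckUseWinograd(const std::vector<index_t> &filter_shape,  // OIHW
                      const int *strides, const int *dilations) {
  return filter_shape[2] == 3 && filter_shape[3] == 3 &&
         strides[0] == 1 && strides[1] == 1 &&
         dilations[0] == 1 && dilations[1] == 1;
}

// Mirrors the constructor logic above: decide the filter layout once, and
// zero the block size on fallback so Run() takes the direct-conv path.
FilterLayout ChooseFilterLayout(const std::vector<index_t> &filter_shape,
                                const int *strides, const int *dilations,
                                int *wino_block_size) {
  if ((*wino_block_size == 2 || *wino_block_size == 4) &&
      CheckUseWinograd(filter_shape, strides, dilations)) {
    return FilterLayout::WINOGRAD_FILTER;
  }
  *wino_block_size = 0;
  return FilterLayout::CONV2D_FILTER;
}

int main() {
  const int strides[] = {1, 1}, dilations[] = {1, 1};
  int blk = 4;
  assert(ChooseFilterLayout({32, 16, 3, 3}, strides, dilations, &blk) ==
         FilterLayout::WINOGRAD_FILTER);
  blk = 4;
  assert(ChooseFilterLayout({32, 16, 1, 1}, strides, dilations, &blk) ==
         FilterLayout::CONV2D_FILTER && blk == 0);
  return 0;
}
```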
......@@ -49,11 +49,10 @@ void Conv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
......@@ -63,26 +62,6 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters,
"Input", {batch, height, width, channels});
net.GetTensor("Input")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {output_channels, kernel_h, kernel_w, channels});
"Filter", {output_channels, kernel_h, kernel_w, channels}, true);
net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels});
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}, true);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......
This diff is collapsed.
......@@ -24,7 +24,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *dilations,
const int *strides,
Padding padding,
......@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *padding_size,
const int *dilations,
const int *strides,
......
......@@ -35,7 +35,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *dilations,
const int *strides,
Padding padding,
......@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,
void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format,
const index_t *filter_shape,
const DataFormat filter_format,
const FilterDataFormat filter_format,
const int *padding_size,
const int *dilations,
const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
std::unique_ptr<Tuner<uint32_t>> tuner;
Workspace ws;
op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferTransform", "BufferTransformTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.AddIntArg("mode", static_cast<int>(NetMode::INIT))
.Finalize(&op_defs[op_defs.size() - 1]);
Tensor *input = ws.CreateTensor("Input", device->allocator(),
DataTypeToEnum<float>::v());
input->Resize({1, 3, 3, 3});
{
Tensor::MappingGuard input_mapper(input);
float *input_data = input->mutable_data<float>();
std::fill(input_data, input_data + input->size(), 1);
}
op_defs.emplace_back(OperatorDef());
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
.Finalize(&op_defs[op_defs.size() - 1]);
NetDef net_def;
for (auto &op_def : op_defs) {
net_def.add_op()->CopyFrom(op_def);
}
std::shared_ptr<OpRegistry> op_registry(new OpRegistry());
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry.get(), &net_def, &ws, device,
NetMode::INIT));
MaceStatus status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
EXPECT_TRUE(ws.GetTensor("Output") == nullptr);
net = std::unique_ptr<NetBase>(new SerialNet(
op_registry.get(), &net_def, &ws, device));
status = net->Init();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
status = net->Run();
MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
EXPECT_TRUE(ws.GetTensor("Output") != nullptr);
ExpectTensorNear<float>(*ws.GetTensor("Input"), *ws.GetTensor("Output"),
1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
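The INIT_MODE test above relies on two-phase execution: a SerialNet built with NetMode::INIT runs only the ops tagged mode = INIT (here the BufferTransform), so B2IOutput exists after the first pass while Output does not appear until the default-mode net runs. A minimal sketch of that selection, with hypothetical stand-in types:

```cpp
#include <cassert>
#include <string>
#include <vector>

enum class NetMode { INIT, NORMAL };

struct OpDef {
  std::string name;
  NetMode mode = NetMode::NORMAL;
};

// Hypothetical selector mirroring what the test observes: an INIT-mode net
// runs only ops tagged NetMode::INIT; a normal net runs the remaining ops.
std::vector<OpDef> SelectOps(const std::vector<OpDef> &all, NetMode pass) {
  std::vector<OpDef> picked;
  for (const auto &op : all) {
    if (op.mode == pass) picked.push_back(op);
  }
  return picked;
}

int main() {
  std::vector<OpDef> ops = {{"BufferTransform", NetMode::INIT},
                            {"BufferInverseTransform", NetMode::NORMAL}};
  // First pass materializes "B2IOutput"; the second produces "Output",
  // matching the two GetTensor checks in the test above.
  assert(SelectOps(ops, NetMode::INIT).size() == 1);
  assert(SelectOps(ops, NetMode::NORMAL).size() == 1);
  return 0;
}
```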
......@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6);
namespace {
template <typename T>
void OpenclCropHelper(int iters,
void OpenCLCropHelper(int iters,
const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1,
int crop_axis,
......@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropBM")
.Input("InputImage0")
.Input("InputImage1")
.Input("Input0")
.Input("Input1")
.AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset})
.Output("OutputImage")
.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters,
_##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
OpenclCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
OpenCLCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##TYPE)
......
......@@ -34,14 +34,10 @@ void RunCrop(const std::vector<index_t> &input_shape,
net.AddRandomInput<D, float>("Input1", input_shape2);
if (D == GPU) {
BufferToImage<D, float>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropTest")
.Input("InputImage0")
.Input("InputImage1")
.Output("OutputImage")
.Input("Input0")
.Input("Input1")
.Output("Output")
.AddIntsArg("offset", offset)
.AddIntArg("axis", axis)
.Finalize(net.NewOperatorDef());
......@@ -66,10 +62,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
// Run
net.RunOp(D);
if (D == GPU) {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC);
}
......
......@@ -30,6 +30,7 @@
#include "mace/ops/arm/deconv_2d_neon.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/deconv_2d.h"
#endif // MACE_ENABLE_OPENCL
......@@ -358,11 +359,27 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::Deconv2dKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0);
......
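The bias-transform branches in the Deconv2dOp constructor encode the framework-specific input layouts: Caffe deconv is {input, filter[, bias]}, while the TensorFlow-style variant inserts an OutputShape tensor before the bias. Restated as a small helper (hypothetical; the real code inlines the branches):

```cpp
#include <cassert>

enum class FrameworkType { TENSORFLOW, CAFFE };

// Which input slot holds the bias depends on the source framework:
//   CAFFE:      {input, filter[, bias]}               -> bias at index 2
//   TENSORFLOW: {input, filter, output_shape[, bias]} -> bias at index 3
int BiasInputIndex(FrameworkType type, int input_size) {
  if (type == FrameworkType::CAFFE) return input_size >= 3 ? 2 : -1;
  return input_size >= 4 ? 3 : -1;  // -1 means "no bias input"
}

int main() {
  assert(BiasInputIndex(FrameworkType::CAFFE, 3) == 2);
  assert(BiasInputIndex(FrameworkType::TENSORFLOW, 4) == 3);
  assert(BiasInputIndex(FrameworkType::TENSORFLOW, 3) == -1);
  return 0;
}
```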
......@@ -47,40 +47,21 @@ static void Deconv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.AddInputFromArray<D, int32_t>("OutputShape", {4},
{batch, out_h, out_w, output_channels});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("OutputShape")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
{batch, out_h, out_w, output_channels},
true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
// Warm-up
......
......@@ -41,40 +41,34 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ops::FrameworkType model_type) {
OpsTestNet net;
// Add input data
const index_t batch = input_shape[0];
const index_t out_channels = filter_shape[2];
net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data);
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
// TODO(liutuo): remove the unused transform
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
ops::BufferType::CONV2D_FILTER);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
.AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef());
} else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("Input")
.Input("FilterOIHW")
.Input("OutputShape")
.Input("BiasImage")
.Output("OutputImage")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size)
......@@ -82,10 +76,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
}
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -102,7 +92,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef());
} else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW")
......@@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, T>("Bias", {output_channels});
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
int out_h = 0;
......@@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch,
output_shape.push_back(out_h);
output_shape.push_back(out_w);
output_shape.push_back(output_channels);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
} else {
paddings.push_back(padding);
paddings.push_back(padding);
......@@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch,
expected->Copy(*net.GetOutput("Output"));
// run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("framework_type", model_type)
......@@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch,
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("BiasImage")
.Output("OutputImage")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntArg("framework_type", model_type)
......@@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch,
// Run on device
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4,
1e-4);
};
......
......@@ -36,23 +36,12 @@ void DepthToSpace(
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("Input")
.Output("Output")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("InputImage")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......
......@@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
"Output", NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage")
.Output("OutputImage")
.Input("Input")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
......@@ -134,28 +128,23 @@ void RandomTest(const int block_size,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImg")
.Input("Input")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
.Output("GPUOutput")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-5);
*net.GetOutput("GPUOutput"), 1e-5);
} else {
ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3, 1e-4);
*net.GetOutput("GPUOutput"), 1e-3, 1e-4);
}
}
} // namespace
......
......@@ -34,8 +34,9 @@
#include "mace/public/mace.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/depthwise_conv2d.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/depthwise_conv2d.h"
#include "mace/ops/opencl/image/depthwise_conv2d.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
......@@ -490,11 +491,27 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
public:
explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
} else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
}
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters,
}
if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
"Filter", {multiplier, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}, true);
} else {
net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
"Filter", {kernel_h, kernel_w, input_channels, multiplier}, true);
net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>(
"Bias", {input_channels * multiplier});
"Bias", {input_channels * multiplier}, true);
}
if (D == DeviceType::CPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
......@@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......
This diff is collapsed.
......@@ -29,6 +29,7 @@
#include "mace/utils/utils.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/depthwise_deconv2d.h"
#endif // MACE_ENABLE_OPENCL
......@@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
}
MaceStatus Run(OpContext *context) override {
......
......@@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters,
}
net.AddRandomInput<D, float>("Filter",
{1, channels, kernel_h,
kernel_w});
if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
}
kernel_w}, true);
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.Setup(D);
......
This diff is collapsed.
......@@ -26,6 +26,7 @@
#include "mace/core/tensor.h"
#include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/eltwise.h"
#endif // MACE_ENABLE_OPENCL
......@@ -1086,12 +1087,28 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::EltwiseKernel<T>(
type, coeff, scalar_input, scalar_input_index));
} else {
MACE_NOT_IMPLEMENTED;
}
// Transform filters
int input_size = operator_def_->input_size();
Workspace *ws = context->workspace();
for (int i = 0; i < input_size; ++i) {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
}
MaceStatus Run(OpContext *context) override {
const Tensor *input0 = this->Input(0);
......
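The EltwiseOp constructor now pre-transforms constant inputs: any tensor that is already present in the workspace and flagged as a weight (e.g. a scalar operand baked into the model) is converted to the ARGUMENT layout once, at construction, since activations only become available at Run(). A compact restatement with stand-in types:

```cpp
#include <cassert>
#include <vector>

// Stand-ins for the real workspace/tensor interfaces (hypothetical).
struct Tensor { bool is_weight = false; };

struct Workspace {
  std::vector<const Tensor *> tensors;  // nullptr if not yet materialized
  bool HasTensor(size_t i) const { return i < tensors.size() && tensors[i]; }
  const Tensor *GetTensor(size_t i) const { return tensors[i]; }
};

// Transform every constant input exactly once at construction time, as the
// Eltwise constructor above does: only tensors already in the workspace and
// flagged as weights qualify.
template <typename TransformFn>
int TransformConstantInputs(const Workspace &ws, int input_size,
                            TransformFn transform) {
  int transformed = 0;
  for (int i = 0; i < input_size; ++i) {
    if (ws.HasTensor(i) && ws.GetTensor(i)->is_weight) {
      transform(i);  // e.g. TransformFilter<T>(..., i, ARGUMENT, mem_type)
      ++transformed;
    }
  }
  return transformed;
}

int main() {
  Tensor activation, scalar_weight{true};
  Workspace ws{{&activation, &scalar_weight}};
  const int calls = TransformConstantInputs(ws, 2, [](int) {});
  assert(calls == 1);  // only the constant scalar gets pre-transformed
  return 0;
}
```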
This diff is collapsed. (8 collapsed files)
......@@ -31,6 +31,7 @@
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL
......@@ -351,11 +352,8 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
public:
explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::MatMulKernel<T>);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_UNUSED(context);
MACE_NOT_IMPLEMENTED;
}
MaceStatus Run(OpContext *context) override {
Validate();
......
This diff is collapsed. (2 collapsed files)
......@@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform(
}
};
}
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MaceStatus::MACE_SUCCESS;
}
......
This diff is collapsed. (67 collapsed files)