提交 bfbe1a30 编写于 作者: 李寅

Merge branch 'unify-cpu-gpu' into 'master'

Unify cpu gpu

See merge request !877
...@@ -69,9 +69,9 @@ in one deployment file. ...@@ -69,9 +69,9 @@ in one deployment file.
- The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe). - The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe).
If there are more than one tensors, use one line for a tensor. If there are more than one tensors, use one line for a tensor.
* - input_shapes * - input_shapes
- The shapes of the input tensors, in NHWC order. - The shapes of the input tensors, default is NHWC order.
* - output_shapes * - output_shapes
- The shapes of the output tensors, in NHWC order. - The shapes of the output tensors, default is NHWC order.
* - input_ranges * - input_ranges
- The numerical range of the input tensors' data, default [-1, 1]. It is only for test. - The numerical range of the input tensors' data, default [-1, 1]. It is only for test.
* - validation_inputs_data * - validation_inputs_data
...@@ -84,6 +84,10 @@ in one deployment file. ...@@ -84,6 +84,10 @@ in one deployment file.
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
* - input_data_types * - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats
- [optional] The format of the input tensors, one of [NONE, NHWC]. If there is no format of the input, please use NONE. If only one single format is specified, all inputs will use that format, default is NHWC order.
* - output_data_formats
- [optional] The format of the output tensors, one of [NONE, NHWC]. If there is no format of the output, please use NONE. If only one single format is specified, all outputs will use that format, default is NHWC order.
* - limit_opencl_kernel_time * - limit_opencl_kernel_time
- [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0. - [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0.
* - obfuscate * - obfuscate
......
# one yaml config file can contain multi device info
devices:
# The name of the device
nanopi:
# arm64 or armhf
target_abis: [arm64, armhf]
# device soc, you can get it from device manual
target_socs: RK3399
# device model full name
models: FriendlyElec Nanopi M4
# device ip address
address: 10.0.0.0
# login username
username: user
# login password; not required if you can log in to the device without a password
password: 1234567
raspberry:
target_abis: [armv7l]
target_socs: BCM2837
models: Raspberry Pi 3 Model B Plus Rev 1.3
address: 10.0.0.1
username: user
password: 123456
...@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false) ...@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC #undef MACE_GET_REPEATED_ARGUMENT_FUNC
// Returns true when the model was converted with quantization enabled,
// i.e. its NetDef carries the argument "quantize_flag" set to 1.
bool IsQuantizedModel(const NetDef &net_def) {
  const int quantize_flag =
      ProtoArgHelper::GetOptionalArg<NetDef, int>(net_def, "quantize_flag", 0);
  return quantize_flag == 1;
}
} // namespace mace } // namespace mace
...@@ -55,6 +55,8 @@ class ProtoArgHelper { ...@@ -55,6 +55,8 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_; std::map<std::string, Argument> arg_map_;
}; };
bool IsQuantizedModel(const NetDef &def);
} // namespace mace } // namespace mace
#endif // MACE_CORE_ARG_HELPER_H_ #endif // MACE_CORE_ARG_HELPER_H_
...@@ -233,6 +233,11 @@ class Image : public BufferBase { ...@@ -233,6 +233,11 @@ class Image : public BufferBase {
} }
} }
  // Returns the data type of the elements stored in this image.
  // The underlying buffer must already be allocated.
  inline DataType dtype() const {
    MACE_CHECK_NOTNULL(buf_);
    return data_type_;
  }
void *buffer() { void *buffer() {
MACE_CHECK_NOTNULL(buf_); MACE_CHECK_NOTNULL(buf_);
return buf_; return buf_;
......
...@@ -34,7 +34,7 @@ class Device { ...@@ -34,7 +34,7 @@ class Device {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
virtual OpenCLRuntime *opencl_runtime() = 0; virtual OpenCLRuntime *opencl_runtime() = 0;
#endif #endif // MACE_ENABLE_OPENCL
virtual CPURuntime *cpu_runtime() = 0; virtual CPURuntime *cpu_runtime() = 0;
virtual Allocator *allocator() = 0; virtual Allocator *allocator() = 0;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/memory_optimizer.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_set>
#include "mace/core/arg_helper.h"
#include "mace/core/macros.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
// Ops whose output may alias the memory block of their first input
// (they move no data, only reinterpret the shape or pass it through).
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kMemoryReuseOps = {
      "Reshape", "Identity", "Squeeze", "ExpandDims"};
  return kMemoryReuseOps.find(op_type) != kMemoryReuseOps.end();
}
// Adds one reference to |tensor_name|, registering it with a count of
// one if it has not been seen before.
void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) {
  // operator[] value-initializes the count to 0 for an unseen tensor,
  // so a single increment covers both the first and subsequent refs;
  // this also avoids the original count()-then-lookup double hashing.
  tensor_ref_count_[tensor_name] += 1;
}
// Updates reference counts for the tensors touched by |op_def|:
// each already-tracked input gains one reference, and each output is
// registered with an initial count of 0 (its consumers add refs later).
void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) {
  int input_size = op_def->input_size();
  for (int i = 0; i < input_size; ++i) {
    // Only inputs already in the map are counted; untracked names
    // (e.g. weights) are intentionally ignored here.
    if (tensor_ref_count_.count(op_def->input(i)) == 1) {
      tensor_ref_count_[op_def->input(i)] += 1;
    }
  }
  int output_size = op_def->output_size();
  for (int i = 0; i < output_size; ++i) {
    if (tensor_ref_count_.count(op_def->output(i)) == 0) {
      tensor_ref_count_.emplace(op_def->output(i), 0);
    }
  }
}
// Computes the memory block needed to hold a tensor of |shape| with
// data type |dt| in |mem_type| memory.
// For GPU images, (x, y) is the 2D image extent; for buffer memory,
// x is the total size in bytes and y is 1.
MemoryBlock MemoryOptimizer::CreateMemoryBlock(
    std::vector<int64_t> shape,
    DataType dt,
    mace::MemoryType mem_type) {
  MemoryBlock block;
#ifdef MACE_ENABLE_OPENCL
  if (mem_type == MemoryType::GPU_IMAGE) {
    std::vector<size_t> image_shape;
    if (shape.size() == 2) {
      // Promote a 2D shape to 4D (N, 1, 1, C) for the image layout.
      shape = {shape[0], 1, 1, shape[1]};
    } else {
      MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input";
    }
    OpenCLUtil::CalImage2DShape(shape,
                                OpenCLBufferType::IN_OUT_CHANNEL,
                                &image_shape);
    block.set_x(image_shape[0]);
    block.set_y(image_shape[1]);
    return block;
  }
#endif // MACE_ENABLE_OPENCL
  MACE_UNUSED(mem_type);
  // Buffer memory: element count times element size, seeded with the
  // per-element byte size as the accumulate initial value.
  int64_t op_mem_size = std::accumulate(shape.begin(),
                                        shape.end(),
                                        GetEnumTypeSize(dt),
                                        std::multiplies<int64_t>());
  block.set_x(op_mem_size);
  block.set_y(1);
  return block;
}
// Assigns a memory block to every output tensor of |op_def|, preferring
// to reuse an idle block (possibly growing it) over allocating a new
// one, then de-references the op's inputs and returns their blocks to
// the idle pool when no longer used.
// |mem_types| maps GPU output tensor names to their memory types.
void MemoryOptimizer::Optimize(
    const mace::OperatorDef *op_def,
    const std::unordered_map<std::string, MemoryType> &mem_types) {
  MACE_LATENCY_LOGGER(2, "Optimize memory");
  // Without one shape per output we cannot size the blocks; skip.
  if (op_def->output_size() != op_def->output_shape_size()) {
    VLOG(1) << op_def->name()
            << ": the number of output shape "
            << "is not equal to the number of output";
    return;
  }
  auto device = static_cast<DeviceType>(op_def->device_type());
  DataType op_dtype = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
      *op_def,
      "T",
      static_cast<int>(DT_FLOAT)));
  MACE_CHECK(
      op_def->output_type_size() == 0 ||
          op_def->output_size() == op_def->output_type_size(),
      "operator output size != operator output type size",
      op_def->output_size(),
      op_def->output_type_size());
  DataType dt;
  int output_size = op_def->output_size();
  for (int i = 0; i < output_size; ++i) {
    // A per-output type, when present, overrides the op-level "T".
    if (i < op_def->output_type_size()) {
      dt = op_def->output_type(i);
    } else {
      dt = op_dtype;
    }
    int best_mem_id = -1;
    MemoryType mem_type = MemoryType::CPU_BUFFER;
    if (device == DeviceType::GPU) {
      mem_type = mem_types.at(op_def->output(i));
    }
    auto shape = std::vector<int64_t>(
        op_def->output_shape(i).dims().begin(),
        op_def->output_shape(i).dims().end());
    MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type);
    MemoryBlock best_mem_block;
    if (IsMemoryReuseOp(op_def->type())) {
      // Memory-reuse ops alias the block of their first input directly.
      if (tensor_mem_map_.count(op_def->input(0)) == 1) {
        best_mem_id = tensor_mem_map_[op_def->input(0)].first;
      }
    } else {
      // NOTE: the original code re-declared an unused copy of |shape|
      // here; it has been removed (dead code, no behavior change).
      int64_t op_mem_size = op_mem_block.x() * op_mem_block.y();
      int64_t best_added_mem_size = LLONG_MAX;
      int64_t best_wasted_mem_size = LLONG_MAX;
      int64_t old_mem_size = 0, new_mem_size = 0;
      MemoryBlock new_mem_block;
      for (auto idle_mem_id : idle_blocks_) {
        if (mem_blocks_[idle_mem_id].mem_type() == mem_type) {
          if (mem_type == MemoryType::GPU_IMAGE) {
            // GPU Image could reuse memory with same data type only
            if (mem_blocks_[idle_mem_id].data_type() != dt) {
              continue;
            }
            old_mem_size =
                mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y();
            new_mem_block.set_x(std::max<int64_t>(mem_blocks_[idle_mem_id].x(),
                                                  op_mem_block.x()));
            new_mem_block.set_y(std::max<int64_t>(mem_blocks_[idle_mem_id].y(),
                                                  op_mem_block.y()));
            new_mem_size = new_mem_block.x() * new_mem_block.y();
          } else {
            old_mem_size = mem_blocks_[idle_mem_id].x();
            new_mem_size = std::max(op_mem_size, old_mem_size);
            new_mem_block.set_x(new_mem_size);
          }
          int64_t added_mem_size = new_mem_size - old_mem_size;
          int64_t wasted_mem_size = new_mem_size - op_mem_size;
          // minimize add_mem_size; if best_mem_add_size is 0,
          // then minimize waste_mem_size
          if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size)
              || (best_added_mem_size == 0 &&
                  wasted_mem_size < best_wasted_mem_size)) {
            best_mem_id = idle_mem_id;
            best_added_mem_size = added_mem_size;
            best_wasted_mem_size = wasted_mem_size;
            best_mem_block = new_mem_block;
          }
        }
      }
      // Reuse the best idle block only when growing it costs no more
      // than the tensor itself; otherwise allocate a fresh block.
      if (best_added_mem_size <= op_mem_size) {
        best_mem_block.set_mem_id(best_mem_id);
        best_mem_block.set_data_type(dt);
        best_mem_block.set_mem_type(mem_type);
        mem_blocks_[best_mem_id] = best_mem_block;
        idle_blocks_.erase(best_mem_id);
      } else {
        best_mem_id = static_cast<int>(mem_blocks_.size());
        best_mem_block.set_mem_id(best_mem_id);
        best_mem_block.set_data_type(dt);
        best_mem_block.set_mem_type(mem_type);
        best_mem_block.set_x(op_mem_block.x());
        best_mem_block.set_y(op_mem_block.y());
        mem_blocks_.push_back(best_mem_block);
      }
    }
    if (best_mem_id != -1) {
      if (mem_ref_count_.count(best_mem_id) == 1) {
        mem_ref_count_[best_mem_id] += 1;
      } else {
        mem_ref_count_[best_mem_id] = 1;
      }
      tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
    }
  }
  // de-refer input tensors
  int input_size = op_def->input_size();
  for (int i = 0; i < input_size; ++i) {
    auto &input_name = op_def->input(i);
    if (tensor_ref_count_.count(input_name) == 1) {
      tensor_ref_count_[input_name] -= 1;
      if (tensor_ref_count_.at(input_name) == 0 &&
          tensor_mem_map_.count(input_name) == 1) {
        // Last consumer finished: return the block to the idle pool
        // once no other tensor still occupies it.
        int mem_id = tensor_mem_map_.at(input_name).first;
        mem_ref_count_[mem_id] -= 1;
        if (mem_ref_count_.at(mem_id) == 0) {
          idle_blocks_.insert(mem_id);
        }
      } else {
        MACE_CHECK(tensor_ref_count_.at(input_name) >= 0);
      }
    }
  }
}
// Returns all memory blocks planned so far, indexed by mem_id.
const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
  return mem_blocks_;
}
// Returns the mapping from tensor name to its assigned
// <mem_id, data_type> pair.
const std::unordered_map<std::string, std::pair<int, DataType>>&
MemoryOptimizer::tensor_mem_map() const {
  return tensor_mem_map_;
}
std::string MemoryOptimizer::DebugInfo() const {
auto memory_type_to_str = [](const MemoryType type) -> std::string {
if (type == MemoryType::CPU_BUFFER) {
return "CPU_BUFFER";
} else if (type == MemoryType::GPU_BUFFER) {
return "GPU_BUFFER";
} else if (type == MemoryType::GPU_IMAGE) {
return "GPU_IMAGE";
} else {
return "UNKNOWN";
}
};
std::stringstream sstream;
sstream << "\n";
size_t block_size = mem_blocks_.size();
for (size_t i = 0; i < block_size; ++i) {
sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type())
<< " ";
if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) {
sstream << DataTypeToString(mem_blocks_[i].data_type()) << " "
"[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]";
} else {
sstream << "[" << mem_blocks_[i].x() << "]";
}
sstream << "\n";
}
return sstream.str();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_
#define MACE_CORE_MEMORY_OPTIMIZER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
// Describes one reusable memory allocation.
// For GPU image memory, (x, y) is the 2D image extent; for buffer
// memory, x is the size in bytes and y is 1.
class MemoryBlock {
 public:
  inline void set_mem_id(int mem_id) {
    mem_id_ = mem_id;
  }
  inline int mem_id() const {
    return mem_id_;
  }
  inline void set_data_type(DataType data_type) {
    data_type_ = data_type;
  }
  inline DataType data_type() const {
    return data_type_;
  }
  inline void set_mem_type(MemoryType mem_type) {
    mem_type_ = mem_type;
  }
  inline MemoryType mem_type() const {
    return mem_type_;
  }
  inline void set_x(int64_t x) {
    x_ = x;
  }
  inline int64_t x() const {
    return x_;
  }
  inline void set_y(int64_t y) {
    y_ = y;
  }
  inline int64_t y() const {
    return y_;
  }
 private:
  // In-class initializers so a default-constructed block has defined
  // values; previously all members were indeterminate and reading a
  // getter before the setters ran was undefined behavior.
  int mem_id_{-1};  // -1 means "not yet assigned"
  DataType data_type_{DataType::DT_FLOAT};
  MemoryType mem_type_{MemoryType::CPU_BUFFER};
  int64_t x_{0};
  int64_t y_{0};
};
// Plans memory reuse for a serialized network: tracks tensor reference
// counts, assigns each output tensor to a memory block, and recycles
// blocks once the tensors occupying them are no longer referenced.
class MemoryOptimizer {
 public:
  // Returns true if |op_type| may alias its input's memory (no copy).
  static bool IsMemoryReuseOp(const std::string &op_type);
  // Adds one reference to |tensor_name| (registers it if unseen).
  void UpdateTensorRef(const std::string &tensor_name);
  // Registers |op_def|'s outputs and bumps refs of its tracked inputs.
  void UpdateTensorRef(const OperatorDef *op_def);
  // Assigns memory blocks to |op_def|'s outputs, reusing idle blocks
  // when possible. |mem_types| maps GPU output names to memory types.
  void Optimize(const OperatorDef *op_def,
                const std::unordered_map<std::string, MemoryType> &mem_types);
  // All planned memory blocks, indexed by mem_id.
  const std::vector<MemoryBlock> &mem_blocks() const;
  // Mapping from tensor name to its assigned <mem_id, data_type>.
  const std::unordered_map<std::string,
                           std::pair<int, DataType>> &tensor_mem_map() const;
  // Human-readable summary of the planned memory blocks.
  std::string DebugInfo() const;
 private:
  // Sizes the block needed for a tensor of |shape| / |dt| in |mem_type|.
  MemoryBlock CreateMemoryBlock(std::vector<int64_t> shape,
                                DataType dt,
                                MemoryType mem_type);
 private:
  std::unordered_map<std::string, int> tensor_ref_count_;
  std::vector<MemoryBlock> mem_blocks_;
  // tensor name : <mem_id, data_type>
  // Buffer memory does not distinguish data types, so the data type is
  // stored here per tensor (GPU images may only be reused by tensors of
  // the same data type).
  std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
  std::unordered_map<int, int> mem_ref_count_;
  std::set<int> idle_blocks_;
};
} // namespace mace
#endif // MACE_CORE_MEMORY_OPTIMIZER_H_
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -25,13 +26,94 @@ ...@@ -25,13 +26,94 @@
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace {
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {}
MemoryType mem_type; // transformed memory type
DataType dtype;
std::vector<index_t> shape; // tensor shape
int op_idx; // operation which generate the tensor
};
#ifdef MACE_ENABLE_OPENCL
std::string TransformedName(const std::string &input_name,
const mace::MemoryType mem_type) {
std::stringstream ss;
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat data_format_flag,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
// Get available devices
auto available_devices = op_registry->AvailableDevices(op_def->type());
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context->set_device(target_device_);
if (target_device_->device_type() == DeviceType::GPU) {
construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
}
break;
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (data_format_flag == NHWC &&
op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
{0, 3, 1, 2});
for (int i = 0; i < 4; ++i) {
op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
}
}
}
}
construct_context->set_operator_def(op_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(construct_context, device_type));
return std::move(op);
}
SerialNet::SerialNet(const OpRegistryBase *op_registry, SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def, const NetDef *net_def,
Workspace *ws, Workspace *ws,
Device *target_device, Device *target_device,
const NetMode mode) MemoryOptimizer *mem_optimizer)
: NetBase(), : NetBase(),
ws_(ws), ws_(ws),
target_device_(target_device), target_device_(target_device),
...@@ -40,44 +122,211 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, ...@@ -40,44 +122,211 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(), target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) { target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Create Operations // output tensor : related information
DeviceType target_device_type = target_device_->device_type(); std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
// add input information
MemoryType target_mem_type;
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
//
DataFormat data_format_flag = NHWC;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// Only could be NONE or NHWC
auto input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (!is_quantize_model &&
input_data_format == NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
} else if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#ifdef MACE_ENABLE_OPENCL
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_); OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create the Operation // Create operation
const int op_device = auto op = CreateOperation(op_registry,
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( &construct_context,
operator_def, "device", static_cast<int>(target_device_type)); op_def,
if (op_device == target_device_type) { data_format_flag,
// Get available devices (sorted based on priority) is_quantize_model);
OperatorDef temp_def(operator_def); #ifdef MACE_ENABLE_OPENCL
auto available_devices = op_registry->AvailableDevices(temp_def.type()); // Add input transform operation if necessary
// Find the device type to run the op. if (target_device_->device_type() == DeviceType::GPU) {
// If the target_device_type in available devices, use target_device_type, const DataType dt =
// otherwise, fallback to CPU device. static_cast<DataType>(
DeviceType device_type = DeviceType::CPU; ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
construct_context.set_device(cpu_device_); *op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
for (auto device : available_devices) { // the outputs' memory type of the operation
if (device == target_device_type) { MemoryType out_mem_type = construct_context.output_mem_type();
device_type = target_device_type; int input_size = op_def->input_size();
construct_context.set_device(target_device_); for (int i = 0; i < input_size; ++i) {
break; if (output_map.count(op_def->input(i)) == 1) {
// if op is memory-reuse op, no transformation
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether is the output tensor of other operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
dt, out_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
} }
} }
temp_def.set_device_type(device_type); // update the map : output_tensor -> Operation
construct_context.set_operator_def(&temp_def); for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
std::unique_ptr<Operation> op( output_mem_map[op_def->output(out_idx)] = out_mem_type;
op_registry->CreateOperation(&construct_context, device_type, mode)); output_map.emplace(
if (op) { op_def->output(out_idx),
operators_.emplace_back(std::move(op)); InternalOutputInfo(
out_mem_type,
dt,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
} }
} }
#endif // MACE_ENABLE_OPENCL
operators_.emplace_back(std::move(op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
auto output_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
}
}
}
#endif // MACE_ENABLE_OPENCL
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
}
// Do memory optimization
for (auto &op : operators_) {
VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type()
<< ", " << op->debug_def().type() << ">";
mem_optimizer->Optimize(op->operator_def().get(), output_mem_map);
} }
VLOG(1) << mem_optimizer->DebugInfo();
} }
MaceStatus SerialNet::Init() { MaceStatus SerialNet::Init() {
// TODO(liuqi): where to do memory reuse.
MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
OpInitContext init_context(ws_); OpInitContext init_context(ws_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
...@@ -95,18 +344,18 @@ MaceStatus SerialNet::Init() { ...@@ -95,18 +344,18 @@ MaceStatus SerialNet::Init() {
} }
MaceStatus SerialNet::Run(RunMetadata *run_metadata) { MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
// TODO(liuqi): In/Out Buffer Transform
MACE_MEMORY_LOGGING_GUARD(); MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
OpContext context(ws_, cpu_device_); OpContext context(ws_, cpu_device_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter; auto &op = *iter;
DeviceType device_type = op->device_type(); DeviceType device_type = op->device_type();
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(), ">", "<", device_type, ", ", op->debug_def().type(),
". mem_id: ", ", ",
MakeListString(op->debug_def().mem_id().data(), ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op->debug_def().mem_id().size())); op->debug_def(), "T", static_cast<int>(DT_FLOAT)),
">");
if (device_type == target_device_->device_type()) { if (device_type == target_device_->device_type()) {
context.set_device(target_device_); context.set_device(target_device_);
} else { } else {
...@@ -173,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -173,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
float max_v = std::numeric_limits<float>::lowest(); float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max(); float min_v = std::numeric_limits<float>::max();
Tensor::MappingGuard guard(op->Output(i)); Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>(); auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) { for (index_t j = 0; j < op->Output(i)->size(); ++j) {
max_v = std::max(max_v, output_data[j]); max_v = std::max(max_v, output_data[j]);
min_v = std::min(min_v, output_data[j]); min_v = std::min(min_v, output_data[j]);
...@@ -189,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -189,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::vector<int> bin_distribution(bin_size, 0); std::vector<int> bin_distribution(bin_size, 0);
float bin_v = (max_v - min_v) / bin_size; float bin_v = (max_v - min_v) / bin_size;
Tensor::MappingGuard guard(op->Output(i)); Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>(); auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) { for (index_t j = 0; j < op->Output(i)->size(); ++j) {
int ind = static_cast<int>((output_data[j] - min_v) / bin_v); int index = static_cast<int>((output_data[j] - min_v) / bin_v);
if (ind < 0) if (index < 0)
ind = 0; index = 0;
else if (ind > bin_size-1) else if (index > bin_size-1)
ind = bin_size-1; index = bin_size-1;
bin_distribution[ind]++; bin_distribution[index]++;
} }
LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
<< "@@" << min_v << "," << max_v<< "@@" << "@@" << min_v << "," << max_v<< "@@"
......
...@@ -27,6 +27,7 @@ namespace mace { ...@@ -27,6 +27,7 @@ namespace mace {
class RunMetadata; class RunMetadata;
class Workspace; class Workspace;
class MemoryOptimizer;
class NetBase { class NetBase {
public: public:
...@@ -47,12 +48,20 @@ class SerialNet : public NetBase { ...@@ -47,12 +48,20 @@ class SerialNet : public NetBase {
const NetDef *net_def, const NetDef *net_def,
Workspace *ws, Workspace *ws,
Device *target_device, Device *target_device,
const NetMode mode = NetMode::NORMAL); MemoryOptimizer * mem_optimizer);
MaceStatus Init() override; MaceStatus Init() override;
MaceStatus Run(RunMetadata *run_metadata = nullptr) override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat input_format,
bool is_quantize_model = false);
protected: protected:
Workspace *ws_; Workspace *ws_;
Device *target_device_; Device *target_device_;
......
...@@ -23,16 +23,12 @@ namespace mace { ...@@ -23,16 +23,12 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws) OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {} : operator_def_(nullptr), ws_(ws), device_(nullptr) {}
OpConstructContext::OpConstructContext(OperatorDef *operator_def,
Workspace *ws,
Device *device)
: operator_def_(operator_def), ws_(ws), device_(device) {}
OpInitContext::OpInitContext(Workspace *ws, Device *device) OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {} : ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context) Operation::Operation(OpConstructContext *context)
: operator_def_(std::make_shared<OperatorDef>(*(context->operator_def()))) : operator_def_(context->operator_def())
{} {}
MaceStatus Operation::Init(OpInitContext *context) { MaceStatus Operation::Init(OpInitContext *context) {
...@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) { ...@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) {
": Encountered a non-existing input tensor: ", input_str); ": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor); inputs_.push_back(tensor);
} }
// TODO(liuqi): filter transform
for (int i = 0; i < operator_def_->output_size(); ++i) { for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i); const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) { if (ws->HasTensor(output_str)) {
// TODO(liuqi): Workspace should pre-allocate all of the output tensors
outputs_.push_back(ws->GetTensor(output_str)); outputs_.push_back(ws->GetTensor(output_str));
} else { } else {
MACE_CHECK( MACE_CHECK(
...@@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) { ...@@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) {
} }
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type))); output_str, context->device()->allocator(), output_type)));
}
if (i < operator_def_->output_shape_size()) { if (i < operator_def_->output_shape_size()) {
std::vector<index_t> std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size()); shape_configured(operator_def_->output_shape(i).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) { for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = operator_def_->output_shape(i).dims(dim); shape_configured[dim] = operator_def_->output_shape(i).dims(dim);
}
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
} }
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
} }
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
...@@ -164,33 +157,34 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices( ...@@ -164,33 +157,34 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
std::unique_ptr<Operation> OpRegistryBase::CreateOperation( std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type, DeviceType device_type) const {
const NetMode mode) const { auto operator_def = context->operator_def();
OperatorDef *operator_def = context->operator_def(); DataType dtype = static_cast<DataType>(
const DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT))); *operator_def, "T", static_cast<int>(DT_FLOAT)));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( if (device_type == DeviceType::CPU && dtype == DT_HALF) {
*operator_def, "mode", static_cast<int>(NetMode::NORMAL)); int arg_size = operator_def->arg_size();
const NetMode op_mode = static_cast<NetMode>(op_mode_i); for (int i = 0; i < arg_size; ++i) {
VLOG(3) << "Creating operator " << operator_def->name() << "(" if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on " << operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type; << device_type;
if (op_mode == mode) { const std::string op_type = context->operator_def()->type();
const std::string op_type = context->operator_def()->type(); MACE_CHECK(registry_.count(op_type) != 0,
MACE_CHECK(registry_.count(op_type) != 0, op_type, " operation is not registered.");
op_type, " operation is not registered.");
std::string key = OpKeyBuilder(op_type)
std::string key = OpKeyBuilder(op_type) .Device(device_type)
.Device(device_type) .TypeConstraint("T", dtype)
.TypeConstraint("T", dtype) .Build();
.Build(); if (registry_.at(op_type)->creators.count(key) == 0) {
if (registry_.at(op_type)->creators.count(key) == 0) { LOG(FATAL) << "Key not registered: " << key;
LOG(FATAL) << "Key not registered: " << key;
}
return registry_.at(op_type)->creators.at(key)(context);
} else {
return nullptr;
} }
return registry_.at(op_type)->creators.at(key)(context);
} }
} // namespace mace } // namespace mace
...@@ -33,14 +33,13 @@ namespace mace { ...@@ -33,14 +33,13 @@ namespace mace {
class OpConstructContext { class OpConstructContext {
public: public:
explicit OpConstructContext(Workspace *ws); explicit OpConstructContext(Workspace *ws);
OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device);
~OpConstructContext() = default; ~OpConstructContext() = default;
inline void set_operator_def(OperatorDef *operator_def) { inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def; operator_def_ = operator_def;
} }
inline OperatorDef *operator_def() const { inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_; return operator_def_;
} }
...@@ -56,10 +55,19 @@ class OpConstructContext { ...@@ -56,10 +55,19 @@ class OpConstructContext {
return device_; return device_;
} }
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
private: private:
OperatorDef *operator_def_; std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_; Workspace *ws_;
Device *device_; Device *device_;
MemoryType output_mem_type_; // used for transform memory
}; };
// memory_optimizer, device // memory_optimizer, device
...@@ -131,14 +139,18 @@ class Operation { ...@@ -131,14 +139,18 @@ class Operation {
} }
inline void set_debug_def( inline void set_debug_def(
const std::shared_ptr<const OperatorDef> &operator_def) { const std::shared_ptr<OperatorDef> &operator_def) {
operator_def_ = operator_def; operator_def_ = operator_def;
} }
inline bool has_debug_def() const { return operator_def_ != nullptr; } inline bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected: protected:
std::shared_ptr<const OperatorDef> operator_def_; std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_; std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_; std::vector<Tensor *> outputs_;
...@@ -190,8 +202,7 @@ class OpRegistryBase { ...@@ -190,8 +202,7 @@ class OpRegistryBase {
std::unique_ptr<Operation> CreateOperation( std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type, DeviceType device_type) const;
const NetMode mode) const;
template <class DerivedType> template <class DerivedType>
static std::unique_ptr<Operation> DefaultCreator( static std::unique_ptr<Operation> DefaultCreator(
......
...@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime(
is_profiling_enabled_(false), is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN), opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN), gpu_type_(UNKNOWN),
mem_type_(MemoryType::GPU_IMAGE) { mem_type_(MemoryType::GPU_IMAGE),
scratch_image_manager_(new ScratchImageManager) {
std::vector<cl::Platform> all_platforms; std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms); cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) { if (all_platforms.size() == 0) {
...@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const { ...@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_; return is_profiling_enabled_;
} }
ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
return scratch_image_manager_.get();
}
} // namespace mace } // namespace mace
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "mace/core/file_storage.h" #include "mace/core/file_storage.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/scratch_image.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/utils/string_util.h" #include "mace/utils/string_util.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
...@@ -82,6 +83,7 @@ class OpenCLRuntime { ...@@ -82,6 +83,7 @@ class OpenCLRuntime {
uint64_t device_global_mem_cache_size() const; uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const; uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner(); Tuner<uint32_t> *tuner();
ScratchImageManager *scratch_image_manager() const;
bool is_opencl_avaliable(); bool is_opencl_avaliable();
// TODO(liuqi): remove this function in the future, make decision at runtime. // TODO(liuqi): remove this function in the future, make decision at runtime.
bool UseImageMemory(); bool UseImageMemory();
...@@ -134,6 +136,7 @@ class OpenCLRuntime { ...@@ -134,6 +136,7 @@ class OpenCLRuntime {
OpenCLVersion opencl_version_; OpenCLVersion opencl_version_;
GPUType gpu_type_; GPUType gpu_type_;
MemoryType mem_type_; MemoryType mem_type_;
std::unique_ptr<ScratchImageManager> scratch_image_manager_;
// All OpenCL object must be a pointer and manually deleted before unloading // All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library. // OpenCL library.
std::shared_ptr<cl::Context> context_; std::shared_ptr<cl::Context> context_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_util.h"
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
namespace {
// [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic, H * W * (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1];
(*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* MIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[1]);
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[0]);
(*image_shape)[1] = 1;
}
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic * H * W, (Oc + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[0]);
}
// [(Ic + 3) / 4 * H * W, Oc]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3];
(*image_shape)[1] = shape[0];
}
} // namespace
// Computes the OpenCL 2D image extent (width, height) needed to hold a
// tensor of `shape` when laid out as `type`. `wino_block_size` is only
// consulted for WINOGRAD_FILTER.
void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                                 const OpenCLBufferType type,
                                 std::vector<size_t> *image_shape,
                                 const int wino_block_size) {
  MACE_CHECK_NOTNULL(image_shape);
  // Dispatch to the shape calculator matching the buffer layout.
  switch (type) {
    case IN_OUT_CHANNEL:
      CalInOutputImageShape(shape, image_shape);
      break;
    case CONV2D_FILTER:
      CalConv2dFilterImageShape(shape, image_shape);
      break;
    case DW_CONV2D_FILTER:
      CalDepthwiseConv2dFilterImageShape(shape, image_shape);
      break;
    case ARGUMENT:
      CalArgImageShape(shape, image_shape);
      break;
    case IN_OUT_HEIGHT:
      CalInOutHeightImageShape(shape, image_shape);
      break;
    case IN_OUT_WIDTH:
      CalInOutWidthImageShape(shape, image_shape);
      break;
    case WINOGRAD_FILTER:
      CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
      break;
    case WEIGHT_HEIGHT:
      CalWeightHeightImageShape(shape, image_shape);
      break;
    case WEIGHT_WIDTH:
      CalWeightWidthImageShape(shape, image_shape);
      break;
    default:
      LOG(FATAL) << "Mace not supported yet.";
  }
}
// Builds a "BufferTransform" OperatorDef that converts `input_name` into
// `output_name`, targeting GPU memory of `mem_type` with data type `dt`.
// When `input_shape` is non-empty it is recorded as the op's output shape.
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
    const std::string &input_name,
    const std::vector<mace::index_t> &input_shape,
    const std::string &output_name,
    const mace::DataType dt,
    const mace::MemoryType mem_type) {
  std::unique_ptr<OperatorDef> op(new OperatorDef);
  op->set_name("mace_node_" + output_name);
  op->set_type("BufferTransform");
  op->add_input(input_name);
  op->add_output(output_name);
  // Appends one integer-valued argument to the op definition.
  auto add_int_arg = [&op](const std::string &name, const int32_t value) {
    Argument *arg = op->add_arg();
    arg->set_name(name);
    arg->set_i(value);
  };
  add_int_arg("buffer_type",
              static_cast<int32_t>(OpenCLBufferType::IN_OUT_CHANNEL));
  add_int_arg("mem_type", static_cast<int32_t>(mem_type));
  add_int_arg("T", static_cast<int32_t>(dt));
  add_int_arg("device", DeviceType::GPU);
  if (!input_shape.empty()) {
    OutputShape *shape = op->add_output_shape();
    for (auto dim : input_shape) {
      shape->add_dims(dim);
    }
  }
  return std::move(op);
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#include <memory>
#include <string>
#include <vector>
#include "mace/core/types.h"
namespace mace {
// Logical layout of a tensor when packed into an OpenCL 2D image.
// The value selects which Cal*ImageShape helper maps tensor dims to the
// image's (width, height) in OpenCLUtil::CalImage2DShape.
enum OpenCLBufferType {
  CONV2D_FILTER = 0,     // conv2d filter, OIHW layout
  IN_OUT_CHANNEL = 1,    // activation in/out, NHWC, channels packed by 4
  ARGUMENT = 2,          // 1-D argument tensor (e.g. bias)
  IN_OUT_HEIGHT = 3,     // activation in/out, height packed by 4
  IN_OUT_WIDTH = 4,      // activation in/out, width packed by 4
  WINOGRAD_FILTER = 5,   // winograd-transformed filter
  DW_CONV2D_FILTER = 6,  // depthwise conv2d filter, MIHW layout
  WEIGHT_HEIGHT = 7,     // weight (OIHW), output channels packed by 4
  WEIGHT_WIDTH = 8,      // weight (OIHW), input channels packed by 4
};
// Stateless helpers for mapping tensors onto OpenCL 2D images and for
// generating buffer-transform operators.
class OpenCLUtil {
 public:
  // Computes the 2D image extent (width, height) holding a tensor of
  // `shape` laid out as `type`; `wino_blk_size` only affects
  // WINOGRAD_FILTER.
  static void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                              const OpenCLBufferType type,
                              std::vector<size_t> *image_shape,
                              const int wino_blk_size = 2);

  // Builds a "BufferTransform" OperatorDef converting `input_name` into
  // `output_name` stored as `mem_type` with data type `dt`; a non-empty
  // `input_shape` is recorded as the op's output shape.
  static std::shared_ptr<OperatorDef> CreateTransformOpDef(
      const std::string &input_name,
      const std::vector<mace::index_t> &input_shape,
      const std::string &output_name,
      const mace::DataType dt,
      const MemoryType mem_type);
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/scratch_image.h"
#include <utility>
#include <vector>
namespace mace {
// Defaulted: the manager owns nothing beyond its member containers.
ScratchImageManager::ScratchImageManager() = default;
ScratchImageManager::~ScratchImageManager() = default;
// Returns an image of at least `shape` with data type `dt`, reusing an
// existing unreferenced image when possible, otherwise allocating a new
// one via `allocator`. On success, bumps the image's reference count and
// stores its index in `*id` (later released via Deactive(id)). Returns
// nullptr if a fresh allocation fails.
Image *ScratchImageManager::Spawn(
    Allocator *allocator,
    const std::vector<size_t> &shape,
    const DataType dt,
    int *id) {
  // TODO(liuqi): not optimal memory reuse strategy
  int found_image_idx = -1;
  int image_count = static_cast<int>(reference_count_.size());
  for (int i = 0; i < image_count; ++i) {
    // Reusable only if currently unreferenced, same dtype, and large
    // enough in both image dimensions.
    // NOTE: index with `i`, not the reference count value — the original
    // code indexed `images_` with the count (always 0 in this branch),
    // so it inspected image 0 while reusing image i.
    if (reference_count_[i] == 0 && images_.at(i)->dtype() == dt) {
      auto image_shape = images_.at(i)->image_shape();
      if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) {
        found_image_idx = i;
        break;
      }
    }
  }
  // No reusable image found: allocate a new one at the next index.
  if (found_image_idx == -1) {
    reference_count_.push_back(0);
    images_[image_count] = std::unique_ptr<Image>(new Image(allocator));
    if (images_.at(image_count)->Allocate(shape, dt) !=
        MaceStatus::MACE_SUCCESS) {
      return nullptr;
    }
    found_image_idx = image_count;
    VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape)
            << "<" << dt << ">";
  }
  reference_count_[found_image_idx] += 1;
  *id = found_image_idx;
  return images_.at(found_image_idx).get();
}
// Drops one reference from image `id`; the image becomes reusable by
// Spawn() once its count reaches zero. Aborts on an unknown or already
// unreferenced id.
void ScratchImageManager::Deactive(int id) {
  const bool valid_id = reference_count_.size() > static_cast<size_t>(id)
      && reference_count_[id] > 0;
  MACE_CHECK(valid_id,
             "Image id ", id, " exceed the vector size ",
             reference_count_.size());
  reference_count_[id] -= 1;
}
// A scratch image starts unbound (id_ == -1) until Scratch() acquires one.
ScratchImage::ScratchImage(mace::ScratchImageManager *manager)
    : manager_(manager), id_(-1) {}
ScratchImage::~ScratchImage() {
  // Release the reference taken by Scratch(); id_ < 0 means nothing was
  // ever acquired from the manager.
  if (id_ >= 0) {
    manager_->Deactive(id_);
  }
}
// Borrows an image of at least `shape`/`dt` from the manager and records
// its id; the reference is returned by this object's destructor.
Image* ScratchImage::Scratch(Allocator *allocator,
                             const std::vector<size_t> &shape,
                             const mace::DataType dt) {
  return manager_->Spawn(allocator, shape, dt, &id_);
}
} // namespace mace
...@@ -12,39 +12,47 @@ ...@@ -12,39 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ #ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ #define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#include <memory>
#include <unordered_map>
#include <vector> #include <vector>
#include "mace/public/mace.h" #include "mace/core/buffer.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
class OpContext; class ScratchImageManager {
class Tensor; public:
ScratchImageManager();
~ScratchImageManager();
Image *Spawn(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id);
namespace ops { void Deactive(int id);
class OpenCLWinogradTransformKernel { private:
public: std::unordered_map<int, std::unique_ptr<Image>> images_;
virtual MaceStatus Compute( std::vector<int> reference_count_;
OpContext *context,
const Tensor *input,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel);
}; };
class OpenCLWinogradInverseTransformKernel { class ScratchImage {
public: public:
virtual MaceStatus Compute( explicit ScratchImage(ScratchImageManager *);
OpContext *context, ~ScratchImage();
const std::vector<const Tensor*> &inputs,
Tensor *output) = 0; Image *Scratch(Allocator *allocator,
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); const std::vector<size_t> &shape,
const DataType dt);
private:
ScratchImageManager *manager_;
int id_;
}; };
} // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
...@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { ...@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
} }
} // namespace numerical_chars } // namespace numerical_chars
enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 }; enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 };
class Tensor { class Tensor {
public: public:
...@@ -222,6 +222,25 @@ class Tensor { ...@@ -222,6 +222,25 @@ class Tensor {
return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image(); return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
} }
inline MemoryType memory_type() const {
MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty");
if (buffer_->OnHost()) {
return MemoryType::CPU_BUFFER;
} else if (typeid(*buffer_) == typeid(Image)) {
return MemoryType::GPU_IMAGE;
} else {
return MemoryType::GPU_BUFFER;
}
}
inline void set_data_format(DataFormat data_format) {
data_format_ = data_format;
}
inline DataFormat data_format() const {
return data_format_;
}
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
inline cl::Image *opencl_image() const { inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), name_, " do not have image"); MACE_CHECK(has_opencl_image(), name_, " do not have image");
...@@ -488,6 +507,7 @@ class Tensor { ...@@ -488,6 +507,7 @@ class Tensor {
int32_t zero_point_; int32_t zero_point_;
float minval_; float minval_;
float maxval_; float maxval_;
DataFormat data_format_; // used for 4D input/output tensor
MACE_DISABLE_COPY_AND_ASSIGN(Tensor); MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
}; };
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <utility> #include <utility>
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/quantize.h" #include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -27,13 +28,6 @@ ...@@ -27,13 +28,6 @@
namespace mace { namespace mace {
namespace { namespace {
bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
static const std::unordered_set<std::string> reuse_buffer_ops {
"Reshape", "Identity", "Squeeze"
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) { bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) { for (auto &op : net_def.op()) {
if (op.type() == "Quantize") { if (op.type() == "Quantize") {
...@@ -48,13 +42,14 @@ Workspace::Workspace() = default; ...@@ -48,13 +42,14 @@ Workspace::Workspace() = default;
Tensor *Workspace::CreateTensor(const std::string &name, Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
DataType type) { DataType type,
bool is_weight) {
if (HasTensor(name)) { if (HasTensor(name)) {
VLOG(3) << "Tensor " << name << " already exists. Skipping."; VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else { } else {
VLOG(3) << "Creating Tensor " << name; VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type, tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
false, name)); is_weight, name));
} }
return GetTensor(name); return GetTensor(name);
} }
...@@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
fused_buffer_ = true; fused_buffer_ = true;
} }
} }
return MaceStatus::MACE_SUCCESS;
}
if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) { MaceStatus Workspace::PreallocateOutputTensor(
MaceStatus status = CreateOutputTensorBuffer(net_def, device); const mace::NetDef &net_def,
if (status != MaceStatus::MACE_SUCCESS) return status; const mace::MemoryOptimizer *mem_optimizer,
Device *device) {
auto &mem_blocks = mem_optimizer->mem_blocks();
for (auto &mem_block : mem_blocks) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", memory type: " << mem_block.mem_type()
<< ", size: " << mem_block.x() << "x" << mem_block.y();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{static_cast<size_t>(mem_block.x()),
static_cast<size_t>(mem_block.y())}, mem_block.data_type()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
VLOG(1) << "Preallocate buffer to tensors";
bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first),
tensor_mem.second.second,
false, tensor_mem.first));
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
if (mem_blocks[tensor_mem.second.first].mem_type()
== MemoryType::GPU_BUFFER ||
is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else {
tensor->set_data_format(DataFormat::NCHW);
}
}
tensor_map_[tensor_mem.first] = std::move(tensor);
} }
if (device_type == DeviceType::CPU) { // add quantize info for output tensors.
if (device->device_type() == DeviceType::CPU) {
for (const auto &op : net_def.op()) { for (const auto &op : net_def.op()) {
VLOG(2) << "Add quantize info for op: " << op.name(); VLOG(2) << "Add quantize info for op: " << op.name();
MACE_CHECK(op.quantize_info().empty() MACE_CHECK(op.quantize_info().empty()
...@@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
Device *device) {
DeviceType device_type = device->device_type();
DataType dtype = DataType::DT_INVALID;
if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id,
// as CPU&GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type && !op.mem_id().empty()) {
const DataType op_dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
break;
}
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
}
// TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.device_type() == device_type) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", device type: " << mem_block.device_type()
<< ", memory type: " << mem_block.mem_type();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) {
std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype)
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
}
VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type) {
if (!op.mem_id().empty()
&& ShouldPreallocateMemoryForOp(op)) {
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = dtype;
}
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]),
output_type, false, op.output(i)));
if (device_type == DeviceType::GPU && tensor->has_opencl_image()) {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0]
<< ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1];
} else {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")"
<< " Mem: " << mem_ids[i]
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size();
}
tensor_map_[op.output(i)] = std::move(tensor);
}
} else {
for (int i = 0; i < op.output().size(); ++i) {
MACE_CHECK(
op.output_type_size() == 0
|| op.output_size()
== op.output_type_size(),
"operator output size != operator output type size",
op.output_size(),
op.output_type_size());
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
op, "T", static_cast<int>(DT_FLOAT)));
}
CreateTensor(op.output(i),
device->allocator(),
output_type);
}
}
for (int output_idx = 0; output_idx < op.output_shape_size();
++output_idx) {
std::vector<index_t>
shape_configured(op.output_shape(output_idx).dims_size());
for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
shape_configured[dim] = op.output_shape(output_idx).dims(dim);
}
tensor_map_[op.output(output_idx)]->SetShapeConfigured(
shape_configured);
}
}
}
return MaceStatus::MACE_SUCCESS;
}
void Workspace::RemoveUnusedBuffer() { void Workspace::RemoveUnusedBuffer() {
auto iter = tensor_map_.begin(); auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end(); auto end_iter = tensor_map_.end();
...@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, ...@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
tensor_buffer_.reset(nullptr); tensor_buffer_.reset(nullptr);
} }
// Removes the named tensor from the workspace, releasing its storage.
// Silently does nothing if no tensor with that name exists.
void Workspace::RemoveTensor(const std::string &name) {
  tensor_map_.erase(name);
}
} // namespace mace } // namespace mace
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
namespace mace { namespace mace {
class MemoryOptimizer;
class Workspace { class Workspace {
public: public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap; typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
...@@ -36,7 +38,8 @@ class Workspace { ...@@ -36,7 +38,8 @@ class Workspace {
Tensor *CreateTensor(const std::string &name, Tensor *CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
DataType type); DataType type,
bool is_weight = false);
inline bool HasTensor(const std::string &name) const { inline bool HasTensor(const std::string &name) const {
return tensor_map_.find(name) != tensor_map_.end(); return tensor_map_.find(name) != tensor_map_.end();
...@@ -52,12 +55,19 @@ class Workspace { ...@@ -52,12 +55,19 @@ class Workspace {
Device *device, Device *device,
const unsigned char *model_data); const unsigned char *model_data);
MaceStatus PreallocateOutputTensor(const NetDef &net_def,
const MemoryOptimizer *mem_optimizer,
Device *device);
void RemoveUnusedBuffer(); void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def, void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data, const unsigned char *model_data,
Allocator *alloc); Allocator *alloc);
void RemoveTensor(const std::string &name);
private: private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
Device *device); Device *device);
......
...@@ -20,9 +20,11 @@ ...@@ -20,9 +20,11 @@
#include <memory> #include <memory>
#include "mace/core/net.h"
#include "mace/core/device_context.h" #include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/ops_registry.h" #include "mace/ops/ops_registry.h"
#include "mace/ops/transpose.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
// Check OpenCL avaliable // Check OpenCL avaliable
auto runtime = device->opencl_runtime(); auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) { if (!runtime->is_opencl_avaliable()) {
LOG(WARNING) << "The device does not support OpenCL";
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
...@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i); const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type); runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
auto opencl_max_image_size = runtime->GetMaxImage2DSize();
if (opencl_max_image_size.empty()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
const std::vector<int64_t> net_max_image_size =
ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
*net_def, "opencl_max_image_size", {0, 0});
if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
|| static_cast<uint64_t>(net_max_image_size[1])
> opencl_max_image_size[1]) {
LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
<< " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
...@@ -288,14 +269,17 @@ class MaceTensor::Impl { ...@@ -288,14 +269,17 @@ class MaceTensor::Impl {
public: public:
std::vector<int64_t> shape; std::vector<int64_t> shape;
std::shared_ptr<float> data; std::shared_ptr<float> data;
DataFormat format;
}; };
MaceTensor::MaceTensor(const std::vector<int64_t> &shape, MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data) { std::shared_ptr<float> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get()); MACE_CHECK_NOTNULL(data.get());
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl()); impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = shape; impl_->shape = shape;
impl_->data = data; impl_->data = data;
impl_->format = format;
} }
MaceTensor::MaceTensor() { MaceTensor::MaceTensor() {
...@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) { ...@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl()); impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
} }
MaceTensor::MaceTensor(const MaceTensor &&other) { MaceTensor::MaceTensor(const MaceTensor &&other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl()); impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
} }
MaceTensor &MaceTensor::operator=(const MaceTensor &other) { MaceTensor &MaceTensor::operator=(const MaceTensor &other) {
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
return *this; return *this;
} }
MaceTensor &MaceTensor::operator=(const MaceTensor &&other) { MaceTensor &MaceTensor::operator=(const MaceTensor &&other) {
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
return *this; return *this;
} }
...@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; } ...@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
std::shared_ptr<float> MaceTensor::data() { return impl_->data; } std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
// Accessor for the layout (data format) the user associated with this tensor.
DataFormat MaceTensor::data_format() const { return impl_->format; }
// Mace Engine // Mace Engine
class MaceEngine::Impl { class MaceEngine::Impl {
public: public:
...@@ -355,6 +347,14 @@ class MaceEngine::Impl { ...@@ -355,6 +347,14 @@ class MaceEngine::Impl {
std::map<std::string, MaceTensor> *outputs, std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata); RunMetadata *run_metadata);
private:
MaceStatus TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor);
MaceStatus TransposeOutput(const Tensor *output_tensor,
std::pair<const std::string, MaceTensor> *output);
private: private:
const unsigned char *model_data_; const unsigned char *model_data_;
size_t model_data_size_; size_t model_data_size_;
...@@ -363,11 +363,12 @@ class MaceEngine::Impl { ...@@ -363,11 +363,12 @@ class MaceEngine::Impl {
std::unique_ptr<Device> device_; std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_; std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_; bool is_quantized_model_;
std::map<std::string, mace::OutputInfo> output_info_map_;
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
std::unique_ptr<HexagonControlWrapper> hexagon_controller_; std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif #endif
std::map<std::string, mace::InputInfo> input_info_map_;
std::map<std::string, mace::OutputInfo> output_info_map_;
MACE_DISABLE_COPY_AND_ASSIGN(Impl); MACE_DISABLE_COPY_AND_ASSIGN(Impl);
}; };
...@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) ...@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
device_type_(config.impl_->device_type()), device_type_(config.impl_->device_type()),
device_(nullptr), device_(nullptr),
ws_(new Workspace()), ws_(new Workspace()),
net_(nullptr) net_(nullptr),
is_quantized_model_(false)
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
, hexagon_controller_(nullptr) , hexagon_controller_(nullptr)
#endif #endif
...@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get())); MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
} }
#endif #endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
// Get input and output information. // Get input and output information.
for (auto &input_info : net_def->input_info()) { for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info; input_info_map_[input_info.name()] = input_info;
...@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's inputs: " << "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_)); << MakeString(MapKeys(input_info_map_));
} }
ws_->CreateTensor(MakeString("mace_input_node_", input_name), ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
device_->allocator(), DT_FLOAT);
} }
for (auto output_name : output_nodes) { for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) { if (output_info_map_.find(output_name) == output_info_map_.end()) {
...@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's outputs " << "' does not belong to model's outputs "
<< MakeString(MapKeys(output_info_map_)); << MakeString(MapKeys(output_info_map_));
} }
ws_->CreateTensor(MakeString("mace_output_node_", output_name),
device_->allocator(), DT_FLOAT);
} }
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) { if (device_type_ == HEXAGON) {
...@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init(
device_.get(), device_.get(),
model_data)); model_data));
MemoryOptimizer mem_optimizer;
// Init model // Init model
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
NetMode::INIT));
MACE_RETURN_IF_ERROR(net->Init());
MACE_RETURN_IF_ERROR(net->Run());
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(), net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def, net_def,
ws_.get(), ws_.get(),
device_.get())); device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
MACE_RETURN_IF_ERROR(net_->Init()); MACE_RETURN_IF_ERROR(net_->Init());
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
} }
...@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() { ...@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() {
#endif #endif
} }
// Copies a user-supplied input MaceTensor into the workspace tensor that
// feeds the network, transposing its layout when the runtime expects a
// different one:
//   - float CPU graphs run in NCHW, so 4-D NHWC inputs are permuted to NCHW;
//   - GPU and quantized graphs run in NHWC, so 4-D NCHW inputs are permuted
//     to NHWC;
//   - everything else is copied verbatim, keeping the caller's format.
// The two transpose branches in the original were line-for-line duplicates
// except for the permutation and target format; they are unified here.
// Returns the status of the resize/transpose/copy.
MaceStatus MaceEngine::Impl::TransposeInput(
    const std::pair<const std::string, MaceTensor> &input,
    Tensor *input_tensor) {
  const std::vector<int64_t> &input_shape = input.second.shape();
  const bool is_4d = input_shape.size() == 4;
  // Float CPU wants NCHW.
  const bool to_nchw = is_4d &&
      device_->device_type() == DeviceType::CPU &&
      !is_quantized_model_ &&
      input.second.data_format() == NHWC;
  // Quantized or GPU wants NHWC. Mutually exclusive with to_nchw.
  const bool to_nhwc = is_4d &&
      (is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
      input.second.data_format() == DataFormat::NCHW;
  if (to_nchw || to_nhwc) {
    VLOG(1) << "Transform input " << input.first << " from "
            << (to_nchw ? "NHWC to NCHW" : "NCHW to NHWC");
    const std::vector<int> dst_dims =
        to_nchw ? std::vector<int>{0, 3, 1, 2} : std::vector<int>{0, 2, 3, 1};
    input_tensor->set_data_format(to_nchw ? DataFormat::NCHW
                                          : DataFormat::NHWC);
    std::vector<index_t> output_shape =
        TransposeShape<int64_t, index_t>(input_shape, dst_dims);
    MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
    Tensor::MappingGuard input_guard(input_tensor);
    float *input_data = input_tensor->mutable_data<float>();
    return ops::Transpose(input.second.data().get(),
                          input_shape,
                          dst_dims,
                          input_data);
  }
  // No layout conversion needed: propagate the caller's format and copy raw.
  input_tensor->set_data_format(input.second.data_format());
  MACE_RETURN_IF_ERROR(input_tensor->Resize(input_shape));
  Tensor::MappingGuard input_guard(input_tensor);
  float *input_data = input_tensor->mutable_data<float>();
  memcpy(input_data, input.second.data().get(),
         input_tensor->size() * sizeof(float));
  return MaceStatus::MACE_SUCCESS;
}
// Copies a finished output tensor back into the user's MaceTensor,
// transposing the layout when the runtime's format differs from the one the
// user requested:
//   - CPU: runtime output is NCHW; a 4-D user tensor with a different format
//     is produced by an NCHW->NHWC transpose;
//   - GPU: permutation is chosen from the runtime tensor's own format;
//   - otherwise the data is copied verbatim after a shape check.
// Returns MACE_INVALID_ARGS when either side is missing a buffer.
MaceStatus MaceEngine::Impl::TransposeOutput(
    const mace::Tensor *output_tensor,
    std::pair<const std::string, mace::MaceTensor> *output) {
  // Guard: both the runtime tensor and the user-provided buffer must exist.
  if (output_tensor == nullptr || output->second.data() == nullptr) {
    return MaceStatus::MACE_INVALID_ARGS;
  }
  const bool is_4d = output->second.shape().size() == 4;
  const bool format_mismatch =
      output->second.data_format() != output_tensor->data_format();
  if (device_->device_type() == DeviceType::CPU && is_4d && format_mismatch) {
    MACE_CHECK(output_tensor->data_format() == NCHW);
    VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
    const std::vector<int> perm = {0, 2, 3, 1};
    std::vector<index_t> expected_shape =
        TransposeShape<index_t, index_t>(output_tensor->shape(), perm);
    MACE_CHECK(expected_shape == output->second.shape())
        << "Output shape mismatch: "
        << MakeString<int64_t>(expected_shape) << " != "
        << MakeString<int64_t>(output->second.shape());
    Tensor::MappingGuard guard(output_tensor);
    const float *src = output_tensor->data<float>();
    return ops::Transpose(src,
                          output_tensor->shape(),
                          perm,
                          output->second.data().get());
  }
  if (device_->device_type() == DeviceType::GPU && is_4d && format_mismatch) {
    VLOG(1) << "Transform output " << output->first << " from "
            << output_tensor->data_format() << " to "
            << output->second.data_format();
    // Pick the permutation from the runtime tensor's actual layout.
    const std::vector<int> perm =
        (output_tensor->data_format() == NCHW)
            ? std::vector<int>{0, 2, 3, 1}
            : std::vector<int>{0, 3, 1, 2};
    std::vector<index_t> expected_shape =
        TransposeShape<index_t, index_t>(output_tensor->shape(), perm);
    MACE_CHECK(expected_shape == output->second.shape())
        << "Output shape mismatch: "
        << MakeString<int64_t>(expected_shape) << " != "
        << MakeString<int64_t>(output->second.shape());
    Tensor::MappingGuard guard(output_tensor);
    const float *src = output_tensor->data<float>();
    return ops::Transpose(src,
                          output_tensor->shape(),
                          perm,
                          output->second.data().get());
  }
  // Same layout: verify the shape and copy the raw floats across.
  Tensor::MappingGuard guard(output_tensor);
  auto runtime_shape = output_tensor->shape();
  int64_t element_count = std::accumulate(runtime_shape.begin(),
                                          runtime_shape.end(), 1,
                                          std::multiplies<int64_t>());
  MACE_CHECK(runtime_shape == output->second.shape())
      << "Output shape mismatch: "
      << MakeString<int64_t>(runtime_shape) << " != "
      << MakeString<int64_t>(output->second.shape());
  std::memcpy(output->second.data().get(), output_tensor->data<float>(),
              element_count * sizeof(float));
  return MaceStatus::MACE_SUCCESS;
}
MaceStatus MaceEngine::Impl::Run( MaceStatus MaceEngine::Impl::Run(
const std::map<std::string, MaceTensor> &inputs, const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs, std::map<std::string, MaceTensor> *outputs,
...@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's inputs: " << "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_)); << MakeString(MapKeys(input_info_map_));
} }
Tensor *input_tensor = Tensor *input_tensor = ws_->GetTensor(input.first);
ws_->GetTensor(MakeString("mace_input_node_", input.first)); MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor));
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
{
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
}
input_tensors.push_back(input_tensor); input_tensors.push_back(input_tensor);
} }
for (auto &output : *outputs) { for (auto &output : *outputs) {
...@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's outputs: " << "' does not belong to model's outputs: "
<< MakeString(MapKeys(output_info_map_)); << MakeString(MapKeys(output_info_map_));
} }
Tensor *output_tensor = Tensor *output_tensor = ws_->GetTensor(output.first);
ws_->GetTensor(MakeString("mace_output_node_", output.first));
output_tensors.push_back(output_tensor); output_tensors.push_back(output_tensor);
} }
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
...@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run(
} }
#endif #endif
for (auto &output : *outputs) { for (auto &output : *outputs) {
Tensor *output_tensor = Tensor *output_tensor = ws_->GetTensor(output.first);
ws_->GetTensor(MakeString("mace_output_node_", output.first));
// save output // save output
if (output_tensor != nullptr && output.second.data() != nullptr) { MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output));
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output.second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(output.second.shape())
<< " != " << MakeString<int64_t>(shape);
std::memcpy(output.second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
......
...@@ -14,7 +14,6 @@ mace { ...@@ -14,7 +14,6 @@ mace {
*mace*NetDef*; *mace*NetDef*;
*mace*MemoryType*; *mace*MemoryType*;
*mace*DataType*; *mace*DataType*;
*mace*MemoryArena*;
*mace*InputInfo*; *mace*InputInfo*;
*mace*OutputInfo*; *mace*OutputInfo*;
*mace*OutputShape*; *mace*OutputShape*;
......
...@@ -30,10 +30,8 @@ cc_library( ...@@ -30,10 +30,8 @@ cc_library(
"arm/*_test.cc", "arm/*_test.cc",
"ops_registry.cc", "ops_registry.cc",
"ops_test_util.cc", "ops_test_util.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc", "buffer_transform.cc",
"lstm_cell.cc", "lstm_cell.cc",
"winograd_transform.cc",
"quantize.cc", "quantize.cc",
], ],
) + if_opencl_enabled(glob( ) + if_opencl_enabled(glob(
...@@ -41,10 +39,8 @@ cc_library( ...@@ -41,10 +39,8 @@ cc_library(
"opencl/*.cc", "opencl/*.cc",
"opencl/image/*.cc", "opencl/image/*.cc",
"opencl/buffer/*.cc", "opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc", "buffer_transform.cc",
"lstm_cell.cc", "lstm_cell.cc",
"winograd_transform.cc",
], ],
exclude = [ exclude = [
"opencl/*_test.cc", "opencl/*_test.cc",
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/activation.h" #include "mace/ops/opencl/image/activation.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation { ...@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
"NOOP")); "NOOP"));
auto relux_max_limit = static_cast<T>( auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f)); Operation::GetOptionalArg<float>("max_limit", 0.0f));
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset( kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit)); new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (D == DeviceType::CPU) { OpDefBuilder("Activation", "ReluBM")
OpDefBuilder("Activation", "ReluBM") .Input("Input")
.Input("Input") .Output("Output")
.Output("Output") .AddStringArg("activation", "RELU")
.AddStringArg("activation", "RELU") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else { } else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "ReluxBM")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "RELUX")
OpDefBuilder("Activation", "ReluxBM") .AddFloatArg("max_limit", 6.0)
.Input("InputImage") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("Output") .Finalize(net.NewOperatorDef());
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "ReluxBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
net.AddRandomInput<D, float>("Alpha", {channels}); net.AddRandomInput<D, T>("Alpha", {channels}, true);
if (D == DeviceType::CPU) { OpDefBuilder("Activation", "PreluBM")
OpDefBuilder("Activation", "PreluBM") .Input("Input")
.Input("Input") .Input("Alpha")
.Input("Alpha") .Output("Output")
.Output("Output") .AddStringArg("activation", "PRELU")
.AddStringArg("activation", "PRELU") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluBM")
.Input("InputImage")
.Input("AlphaImage")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else { } else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "TanhBM")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "TANH")
OpDefBuilder("Activation", "TanhBM") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Input("InputImage") .Finalize(net.NewOperatorDef());
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "TanhBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -310,27 +262,17 @@ void SigmoidBenchmark( ...@@ -310,27 +262,17 @@ void SigmoidBenchmark(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else { } else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "SigmoidBM")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "SIGMOID")
OpDefBuilder("Activation", "SigmoidBM") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Input("InputImage") .Finalize(net.NewOperatorDef());
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "SigmoidBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -30,32 +30,14 @@ void TestSimpleRelu() { ...@@ -30,32 +30,14 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "ReluTest")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "RELU")
OpDefBuilder("Activation", "ReluTest") .Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output // Run
ImageToBuffer<D, float>(&net, "OutputImage", "Output", net.RunOp(D);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
...@@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() { ...@@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "ReluTest")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "RELU")
OpDefBuilder("Activation", "ReluTest") .Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output // Run
ImageToBuffer<D, float>(&net, "OutputImage", "Output", net.RunOp(D);
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
...@@ -129,34 +93,15 @@ void TestSimpleRelux() { ...@@ -129,34 +93,15 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "ReluxTest")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "RELUX")
OpDefBuilder("Activation", "ReluxTest") .AddFloatArg("max_limit", 6)
.Input("InputImage") .Finalize(net.NewOperatorDef());
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
...@@ -179,34 +124,15 @@ void TestSimpleReluRelux() { ...@@ -179,34 +124,15 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "ReluxTest")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "RELUX")
OpDefBuilder("Activation", "ReluxTest") .AddFloatArg("max_limit", 6)
.Input("InputImage") .Finalize(net.NewOperatorDef());
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
...@@ -232,45 +158,36 @@ void TestSimplePrelu() { ...@@ -232,45 +158,36 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}); net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}, true);
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluTest") OpDefBuilder("Activation", "PreluTest")
.Input("InputImage") .Input("Input")
.Input("AlphaImage") .Input("Alpha")
.Output("OutputImage") .Output("Output")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else { } else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Activation", "PreluTest") OpDefBuilder("Activation", "PreluTest")
.Input("Input") .Input("InputNCHW")
.Input("Alpha") .Input("Alpha")
.Output("Output") .Output("OutputNCHW")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} }
if (D == DeviceType::CPU) { auto expected = net.CreateTensor<float>(
auto expected = net.CreateTensor<float>( {2, 2, 2, 2},
{2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} }
} // namespace } // namespace
...@@ -290,32 +207,14 @@ void TestSimpleTanh() { ...@@ -290,32 +207,14 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "TanhTest")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "TANH")
OpDefBuilder("Activation", "TanhTest") .Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "TanhTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
...@@ -343,32 +242,14 @@ void TestSimpleSigmoid() { ...@@ -343,32 +242,14 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) { OpDefBuilder("Activation", "SigmoidTest")
BufferToImage<D, float>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Output("Output")
.AddStringArg("activation", "SIGMOID")
OpDefBuilder("Activation", "SigmoidTest") .Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "SigmoidTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
......
...@@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c}); net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
} }
if (D == DeviceType::GPU) { OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(), op_def_builder.Input(MakeString("Input", i).c_str());
MakeString("InputImage", i).c_str(),
ops::BufferType::IN_OUT_CHANNEL);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("Input", i).c_str());
}
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} }
op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -62,39 +62,15 @@ void SimpleAdd3() { ...@@ -62,39 +62,15 @@ void SimpleAdd3() {
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1},
{-0.1582, 2, 3, 4, 5, 6}); {-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4; OpDefBuilder("AddN", "AddNTest")
if (D == DeviceType::GPU) { .Input("Input0")
// run on gpu .Input("Input1")
for (int i = 0; i < input_num; ++i) { .Input("Input2")
BufferToImage<D, half>(&net, MakeString("Input", i), .Input("Input3")
MakeString("InputImage", i), .Output("Output")
ops::BufferType::IN_OUT_CHANNEL); .Finalize(net.NewOperatorDef());
} // Run
net.RunOp(D);
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("AddN", "AddNTest")
.Input("Input0")
.Input("Input1")
.Input("Input2")
.Input("Input3")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = auto expected =
net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
...@@ -138,28 +114,10 @@ void RandomTest() { ...@@ -138,28 +114,10 @@ void RandomTest() {
auto expected = net.CreateTensor<float>(); auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on device
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2,
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/batch_norm.h" #include "mace/ops/opencl/image/batch_norm.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -147,12 +148,27 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -147,12 +148,27 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
ActivationType activation = ops::StringToActivationType( ActivationType activation = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP")); Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f); float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BatchNormKernel<T>( kernel_.reset(new opencl::image::BatchNormKernel<T>(
epsilon, activation, relux_max_limit)); epsilon, activation, relux_max_limit));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
// Transform filters
int input_size = operator_def_->input_size();
for (int i = 1; i < input_size; ++i) {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
bool not_folded = this->InputSize() == 5; bool not_folded = this->InputSize() == 5;
......
...@@ -36,13 +36,12 @@ void BatchNorm( ...@@ -36,13 +36,12 @@ void BatchNorm(
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
net.AddRandomInput<D, T>("Scale", {channels}); net.AddRandomInput<D, T>("Scale", {channels}, true);
net.AddRandomInput<D, T>("Offset", {channels}); net.AddRandomInput<D, T>("Offset", {channels}, true);
net.AddRandomInput<D, T>("Mean", {channels}); net.AddRandomInput<D, T>("Mean", {channels}, true);
net.AddRandomInput<D, T>("Var", {channels}, true); net.AddRandomInput<D, T>("Var", {channels}, true, true);
if (D == DeviceType::CPU) { OpDefBuilder("BatchNorm", "BatchNormBM")
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input") .Input("Input")
.Input("Scale") .Input("Scale")
.Input("Offset") .Input("Offset")
...@@ -50,30 +49,8 @@ void BatchNorm( ...@@ -50,30 +49,8 @@ void BatchNorm(
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("Output") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
......
...@@ -28,10 +28,10 @@ void Simple() { ...@@ -28,10 +28,10 @@ void Simple() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}); net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}, true);
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}); net.AddInputFromArray<D, float>("Offset", {1}, {2.0}, true);
net.AddInputFromArray<D, float>("Mean", {1}, {10}); net.AddInputFromArray<D, float>("Mean", {1}, {10}, true);
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}); net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
...@@ -49,32 +49,17 @@ void Simple() { ...@@ -49,32 +49,17 @@ void Simple() {
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} }
// Check // Check
...@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Tuning // Tuning
...@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4); 1e-5, 1e-4);
} }
...@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2); 1e-1, 1e-2);
} }
...@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// tuning // tuning
...@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4); 1e-5, 1e-4);
} }
...@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2); 1e-1, 1e-2);
} }
......
...@@ -32,23 +32,13 @@ void BMBatchToSpace( ...@@ -32,23 +32,13 @@ void BMBatchToSpace(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::CPU) { OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("Input")
.Input("Input") .Output("Output")
.Output("Output") .AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("crops", {0, 0, 0, 0}) .AddIntsArg("block_shape", {arg, arg})
.AddIntsArg("block_shape", {arg, arg}) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.RunOp(D);
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/bias_add.h" #include "mace/ops/opencl/image/bias_add.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
: Operation(context), : Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>( data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) { "data_format", NHWC))) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BiasAddKernel<T>); kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
net.AddRandomInput<D, T>("Bias", {channels}, true); net.AddRandomInput<D, T>("Bias", {channels}, true, true);
if (D == DeviceType::CPU) { OpDefBuilder("BiasAdd", "BiasAddBM")
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input") .Input("Input")
.Input("Bias") .Input("Bias")
.AddIntArg("data_format", NCHW) .AddIntArg("data_format", data_format)
.Output("Output") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("InputImage")
.Input("BiasImage")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -28,7 +28,7 @@ void BiasAddSimple() { ...@@ -28,7 +28,7 @@ void BiasAddSimple() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
...@@ -44,22 +44,13 @@ void BiasAddSimple() { ...@@ -44,22 +44,13 @@ void BiasAddSimple() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("Input")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
auto expected = net.CreateTensor<float>(); auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on gpu
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("Input")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
...@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
auto expected = net.CreateTensor<float>(); auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on gpu
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("Input")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} // namespace test } // namespace test
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BufferInverseTransformOp;
template <typename T>
class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
 public:
  // Turns a GPU-side tensor back into a plain buffer layout. The concrete
  // kernel depends on how the OpenCL runtime stores tensors: image memory
  // uses an image->buffer copy, otherwise an in-buffer layout transform.
  explicit BufferInverseTransformOp(OpConstructContext *context)
      : Operation(context),
        wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) {
    const bool use_image_memory =
        context->device()->opencl_runtime()->UseImageMemory();
    if (!use_image_memory) {
      kernel_.reset(new opencl::buffer::BufferInverseTransform<T>);
    } else {
      kernel_.reset(new opencl::image::ImageToBuffer<T>);
    }
  }
  MaceStatus Run(OpContext *context) override {
    // "buffer_type" tells the kernel which logical layout the data had
    // before the forward transform (defaults to a conv2d filter layout).
    auto type = static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
        "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
    const Tensor *src = this->Input(0);
    Tensor *dst = this->Output(0);
    return kernel_->Compute(context, src, type, wino_blk_size_, dst);
  }
 private:
  const int wino_blk_size_;  // winograd block size forwarded to the kernel
  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
};
// Registers the GPU-only BufferInverseTransform op for both float and half
// data types; there is intentionally no CPU variant, since the op exists
// solely to undo OpenCL-specific memory layout transforms.
void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
                   BufferInverseTransformOp, DeviceType::GPU, float);
  MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
                   BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace ops
} // namespace mace
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h" #include "mace/core/testing/test_benchmark.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters, ...@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters,
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width}); {out_channel, in_channel, height, width});
// Create output
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpDefBuilder("BufferToImage", "BufferToImageBM") auto transform_func = [&]() {
.Input("Input") OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Output("Output") .Transform(&context,
.Finalize(net.NewOperatorDef()); net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
b2i_output);
};
// Warm-up // Warm-up
net.Setup(D); net.Setup(D);
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.Run(); transform_func();
} }
net.Sync(); net.Sync();
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.Run(); transform_func();
} }
net.Sync(); net.Sync();
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -21,31 +22,27 @@ namespace test { ...@@ -21,31 +22,27 @@ namespace test {
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestBidirectionTransform(const int type, void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", input_shape); net.AddRandomInput<D, T>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
net.RunOp(D); .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") // Inverse Transform
.Input("B2IOutput") Tensor *i2b_output = net.ws()->CreateTensor(
.Output("I2BOutput") "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
.AddIntArg("buffer_type", type) OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.AddIntArg("T", DataTypeToEnum<T>::value) .Transform(&context, b2i_output,
.Finalize(net.NewOperatorDef()); type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Run
net.RunOp(D);
// Check // Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
...@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type, ...@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgSmall) { TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{1});
} }
TEST(BufferToImageTest, ArgHalfSmall) { TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, half>(OpenCLBufferType::ARGUMENT,
{11});
} }
TEST(BufferToImageTest, ArgMedium) { TEST(BufferToImageTest, ArgMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{11});
} }
TEST(BufferToImageTest, ArgLarge) { TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256}); TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{256});
} }
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{1, 2, 3, 1}); OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{1, 2, 3, 3}); OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{3, 2, 3, 3}); OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedium) { TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{3, 13, 17, 128}); OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{3, 64, 64, 256}); OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{5, 3, 1, 1}); {5, 3, 1, 1});
} }
TEST(BufferToImageTest, Filter1x1Medium) { TEST(BufferToImageTest, Filter1x1Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{13, 17, 1, 1}); {13, 17, 1, 1});
} }
TEST(BufferToImageTest, Filter1x1Large) { TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{512, 128, 1, 1}); {512, 128, 1, 1});
} }
TEST(BufferToImageTest, Filter3x3Small) { TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{3, 5, 3, 3}); {3, 5, 3, 3});
} }
TEST(BufferToImageTest, Filter3x3Medium) { TEST(BufferToImageTest, Filter3x3Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{17, 13, 3, 3}); {17, 13, 3, 3});
} }
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{256, 128, 3, 3}); {256, 128, 3, 3});
} }
TEST(BufferToImageTest, WeightWidthSmall) { TEST(BufferToImageTest, WeightWidthSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH, TestBidirectionTransform<DeviceType::GPU, float>(
{1, 3, 3, 3}); OpenCLBufferType::WEIGHT_WIDTH,
{1, 3, 3, 3});
} }
TEST(BufferToImageTest, WeightWidthMedium) { TEST(BufferToImageTest, WeightWidthMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH, TestBidirectionTransform<DeviceType::GPU, float>(
{11, 13, 13, 17}); OpenCLBufferType::WEIGHT_WIDTH,
{11, 13, 13, 17});
} }
TEST(BufferToImageTest, WeightWidthLarge) { TEST(BufferToImageTest, WeightWidthLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH, TestBidirectionTransform<DeviceType::GPU, float>(
{64, 64, 11, 13}); OpenCLBufferType::WEIGHT_WIDTH,
{64, 64, 11, 13});
} }
TEST(BufferToImageTest, WeightHeightSmall) { TEST(BufferToImageTest, WeightHeightSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT, TestBidirectionTransform<DeviceType::GPU, float>(
{2, 1, 1, 1}); OpenCLBufferType::WEIGHT_HEIGHT,
{2, 1, 1, 1});
} }
TEST(BufferToImageTest, WeightHeightMedium) { TEST(BufferToImageTest, WeightHeightMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT, TestBidirectionTransform<DeviceType::GPU, float>(
{11, 13, 13, 17}); OpenCLBufferType::WEIGHT_HEIGHT,
{11, 13, 13, 17});
} }
TEST(BufferToImageTest, WeightHeightLarge) { TEST(BufferToImageTest, WeightHeightLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT, TestBidirectionTransform<DeviceType::GPU, float>(
{64, 16, 11, 13}); OpenCLBufferType::WEIGHT_HEIGHT,
{64, 16, 11, 13});
} }
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", input_shape); net.AddRandomInput<D, float>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
net.RunOp(D); .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run // Inverse Transform
net.RunOp(D); Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check // Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
...@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type, ...@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) { TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(
{11}); OpenCLBufferType::ARGUMENT,
{11});
} }
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type, void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const unsigned char *input_data) { const unsigned char *input_data) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
const half *h_data = reinterpret_cast<const half *>(input_data); const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape, net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2)); std::vector<half>(h_data, h_data + 2));
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run // Transform
net.RunOp(D); OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") type, MemoryType::GPU_IMAGE, 0, b2i_output);
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Run // Inverse Transform
net.RunOp(D); Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check // Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
...@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { ...@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = { const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40, 0xCD, 0x3C, 0x33, 0x40,
}; };
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
{2}, input_data); OpenCLBufferType::ARGUMENT, {2}, input_data);
} }
} // namespace test } // namespace test
......
...@@ -15,8 +15,7 @@ ...@@ -15,8 +15,7 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_transform.h" #include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -29,29 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -29,29 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
public: public:
explicit BufferTransformOp(OpConstructContext *context) explicit BufferTransformOp(OpConstructContext *context)
: Operation(context), : Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) { wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)),
if (context->device()->opencl_runtime()->UseImageMemory()) { out_mem_type_(static_cast<MemoryType>(Operation::GetOptionalArg<int>(
kernel_.reset(new opencl::image::BufferToImage<T>); "mem_type", static_cast<int>(MemoryType::GPU_IMAGE)))) {}
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
ops::BufferType type = auto type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>( static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER))); "buffer_type", static_cast<int>(CONV2D_FILTER)));
return kernel_->Compute(context, input, type, MemoryType in_mem_type = context->workspace()->GetTensor(
wino_blk_size_, output); operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
} }
private: private:
const int wino_blk_size_; const int wino_blk_size_;
std::unique_ptr<OpenCLBufferTransformKernel> kernel_; MemoryType out_mem_type_;
}; };
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <cstring> #include <cstring>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase { ...@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase {
namespace { namespace {
template <typename OrgType, typename DstType> template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type, void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape); net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
Tensor *bt_output = net.ws()->CreateTensor(
// Run "BtOutput", context.device()->allocator(),
net.RunOp(DeviceType::GPU); DataTypeToEnum<DstType>::value);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
.Input("TransformedOutput") MemoryType::GPU_BUFFER)
.Output("Output") .Transform(&context, net.ws()->GetTensor("Input"),
.AddIntArg("buffer_type", type) type, MemoryType::GPU_BUFFER, 0, bt_output);
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef()); // Inverse Transform
Tensor *output = net.ws()->CreateTensor(
// Run "Output", context.device()->allocator(),
net.RunOp(DeviceType::GPU); DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) { if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
...@@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type, ...@@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type,
} // namespace } // namespace
TEST_F(BufferTransformTest, FloatToHalf) { TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL, TestBidirectionTransform<float, half>(OpenCLBufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4}); {1, 2, 3, 4});
} }
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
namespace { namespace {
template <typename T> template <typename T>
void TestArgumentTransform(const index_t input_size) { void TestArgumentTransform(const index_t input_size) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size}); net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run // Run
net.RunOp(DeviceType::GPU); Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, output);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4); index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]); EXPECT_EQ(expected_size, output->buffer_shape()[0]);
// Check // Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor, ExpectTensorNear<T>(*net.GetTensor("Input"), *output,
1e-3, 1e-4); 1e-3, 1e-4);
} }
} // namespace } // namespace
......
...@@ -36,23 +36,11 @@ void ChannelShuffle( ...@@ -36,23 +36,11 @@ void ChannelShuffle(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (D == DeviceType::CPU) { OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
"Input", {1, 1, 2, 16}, "Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage") .Input("Input")
.Output("OutputImage") .Output("Output")
.AddIntArg("group", 4) .AddIntArg("group", 4)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 16}, {1, 1, 2, 16},
......
...@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation { ...@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation {
public: public:
explicit ConcatOpBase(OpConstructContext *context) explicit ConcatOpBase(OpConstructContext *context)
: Operation(context), : Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)) {} axis_(Operation::GetOptionalArg<int>("axis", 3)),
checked_(false) {}
protected: protected:
void Validate() { void Validate() {
...@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation { ...@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation {
protected: protected:
int axis_; int axis_;
bool checked_;
}; };
template <DeviceType D, class T> template <DeviceType D, class T>
...@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase { ...@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
Validate(); if (!checked_) {
Validate();
if (this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
}
checked_ = true;
}
const std::vector<const Tensor *> &inputs = this->Inputs(); const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
const Tensor *input0 = inputs.front(); const Tensor *input0 = inputs.front();
......
...@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128); ...@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128);
namespace { namespace {
template <typename T> template <typename T>
void OpenclConcatHelper(int iters, void OpenCLConcatHelper(int iters,
const std::vector<index_t> &shape0, const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1, const std::vector<index_t> &shape1,
int concat_dim) { int concat_dim) {
...@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters, ...@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM") OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0") .Input("Input0")
.Input("InputImage1") .Input("Input1")
.AddIntArg("axis", concat_dim) .AddIntArg("axis", concat_dim)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters, ...@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters,
#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ #define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\ static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
std::vector<index_t> shape = {N, H, W, C}; \ std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \ OpenCLConcatHelper<TYPE>(iters, shape, shape, 3); \
} \ } \
MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
......
...@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) { ...@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) {
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
int dim = 5; int dim = 5;
int num_inputs = 2 + rand_r(&seed) % 10; int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim; int axis = 1;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
...@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { ...@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
int dim = 4; int dim = 4;
int num_inputs = 2 + rand_r(&seed) % 10; int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim; int axis = 1;
int axis_arg = 3; // NHWC
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { ...@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
std::vector<index_t> output_shape = input_shapes[0]; std::vector<index_t> output_shape = input_shapes[0];
output_shape[axis] = concat_axis_size; output_shape[axis] = concat_axis_size;
net.AddRandomInput<DeviceType::CPU, float>( net.AddRandomInput<DeviceType::CPU, float>(
"Output", output_shape, true, true); "Output", output_shape, false, true, true);
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(MakeString("Input", i)); builder = builder.Input(MakeString("Input", i));
} }
builder.AddIntArg("axis", axis) builder.AddIntArg("axis", axis_arg)
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { ...@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
net.RunOp(); net.RunOp();
net.AddRandomInput<DeviceType::CPU, uint8_t>( net.AddRandomInput<DeviceType::CPU, uint8_t>(
"QuantizedOutput", output_shape, true, true); "QuantizedOutput", output_shape, false, true, true);
auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest"); auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest");
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
q_builder = q_builder.Input(MakeString("QuantizedInput", i)); q_builder = q_builder.Input(MakeString("QuantizedInput", i));
...@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
OpsTestNet net; OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i); const std::string input_name = MakeString("Input", i);
const std::string image_name = MakeString("InputImage", i);
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]); GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i], net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]); inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
ops::BufferType::IN_OUT_CHANNEL);
} }
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
const std::string image_name = MakeString("InputImage", i); const std::string image_name = MakeString("Input", i);
builder = builder.Input(image_name); builder = builder.Input(image_name);
} }
builder.AddIntArg("axis", axis) builder.AddIntArg("axis", axis)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto output = net.GetOutput("Output"); auto output = net.GetOutput("Output");
......
...@@ -38,8 +38,9 @@ ...@@ -38,8 +38,9 @@
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/conv_2d.h" #include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/conv_2d.h" #include "mace/ops/opencl/buffer/conv_2d.h"
#include "mace/ops/opencl/image/conv_2d.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
...@@ -958,13 +959,45 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -958,13 +959,45 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
: ConvPool2dOpBase(context), : ConvPool2dOpBase(context),
activation_(ops::StringToActivationType( activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
"NOOP"))), "NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) { relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::Conv2dKernel<T>); kernel_.reset(new opencl::image::Conv2dKernel<T>);
} else { } else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::Conv2dKernel<T>); kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
} }
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
context->device()->opencl_runtime(),
context->workspace()->GetTensor(
operator_def_->input(1))->shape(),
std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
operator_def_->output_shape(0).dims().end()),
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -974,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -974,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
return kernel_->Compute(context, input, filter, bias, return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_, strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_, dilations_.data(), activation_, relux_max_limit_,
output); wino_block_size_, output);
} }
private: private:
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
std::unique_ptr<OpenCLConv2dKernel> kernel_; std::unique_ptr<OpenCLConv2dKernel> kernel_;
int wino_block_size_;
private: private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
......
...@@ -49,11 +49,10 @@ void Conv2d(int iters, ...@@ -49,11 +49,10 @@ void Conv2d(int iters,
} }
net.AddRandomInput<D, float>("Filter", net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h, {output_channels, channels, kernel_h,
kernel_w}); kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels}, true);
if (D == DeviceType::CPU) { OpDefBuilder("Conv2D", "Conv2dTest")
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
...@@ -63,26 +62,6 @@ void Conv2d(int iters, ...@@ -63,26 +62,6 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation}) .AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D); net.Setup(D);
...@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters, ...@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters,
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.GetTensor("Input")->SetScale(0.1); net.GetTensor("Input")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, uint8_t>( net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {output_channels, kernel_h, kernel_w, channels}); "Filter", {output_channels, kernel_h, kernel_w, channels}, true);
net.GetTensor("Filter")->SetScale(0.1); net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}); net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}, true);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
......
此差异已折叠。
...@@ -24,7 +24,7 @@ namespace ops { ...@@ -24,7 +24,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape, void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
Padding padding, Padding padding,
...@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC ...@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
void CalcOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *padding_size, const int *padding_size,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
......
...@@ -35,7 +35,7 @@ namespace ops { ...@@ -35,7 +35,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape, void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
Padding padding, Padding padding,
...@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, ...@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,
void CalcOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *padding_size, const int *padding_size,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Verifies the two-phase net execution contract: an operator tagged with
// NetMode::INIT runs only during an INIT-mode SerialNet pass, while untagged
// operators run only during the normal pass.  A buffer->image transform
// (INIT) followed by the inverse transform (normal) must round-trip the
// input tensor unchanged (within tolerance).
TEST(CoreTest, INIT_MODE) {
  std::vector<OperatorDef> op_defs;

  // GPU device is required: BufferTransform moves data between OpenCL
  // buffer and image memory.
  Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
  std::unique_ptr<Tuner<uint32_t>> tuner;  // NOTE(review): unused in this test — presumably leftover; confirm whether it can be removed.
  Workspace ws;

  // Op 1: buffer->image transform, explicitly tagged to run in INIT mode.
  op_defs.emplace_back(OperatorDef());
  OpDefBuilder("BufferTransform", "BufferTransformTest")
      .Input("Input")
      .Output("B2IOutput")
      .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
      .AddIntArg("mode", static_cast<int>(NetMode::INIT))
      .Finalize(&op_defs[op_defs.size() - 1]);

  // Source tensor: shape {1, 3, 3, 3}, every element set to 1.
  Tensor *input = ws.CreateTensor("Input", device->allocator(),
                                  DataTypeToEnum<float>::v());
  input->Resize({1, 3, 3, 3});
  {
    // MappingGuard scopes the host mapping so it is released before the
    // net runs on the device.
    Tensor::MappingGuard input_mapper(input);
    float *input_data = input->mutable_data<float>();
    std::fill(input_data, input_data + input->size(), 1);
  }

  // Op 2: inverse (image->buffer) transform; no mode arg, so it runs in
  // the normal (non-INIT) pass.
  op_defs.emplace_back(OperatorDef());
  OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
      .Input("B2IOutput")
      .Output("Output")
      .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER)
      .Finalize(&op_defs[op_defs.size() - 1]);

  // Assemble both ops into a single NetDef.
  NetDef net_def;
  for (auto &op_def : op_defs) {
    net_def.add_op()->CopyFrom(op_def);
  }

  // Pass 1: INIT mode.  Only the INIT-tagged transform should execute, so
  // "B2IOutput" must exist and "Output" must not.
  std::shared_ptr<OpRegistry> op_registry(new OpRegistry());
  auto net = std::unique_ptr<NetBase>(new SerialNet(
      op_registry.get(), &net_def, &ws, device,
      NetMode::INIT));
  MaceStatus status = net->Init();
  MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
  status = net->Run();
  MACE_CHECK(status == MaceStatus::MACE_SUCCESS);

  EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr);
  EXPECT_TRUE(ws.GetTensor("Output") == nullptr);

  // Pass 2: normal mode over the same workspace.  The inverse transform now
  // runs, producing "Output" equal to the original "Input".
  net = std::unique_ptr<NetBase>(new SerialNet(
      op_registry.get(), &net_def, &ws, device));
  status = net->Init();
  MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
  status = net->Run();
  MACE_CHECK(status == MaceStatus::MACE_SUCCESS);
  EXPECT_TRUE(ws.GetTensor("Output") != nullptr);

  // Round-trip check: buffer->image->buffer must preserve values to 1e-5.
  ExpectTensorNear<float>(*ws.GetTensor("Input"), *ws.GetTensor("Output"),
                          1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
...@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6); ...@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6);
namespace { namespace {
template <typename T> template <typename T>
void OpenclCropHelper(int iters, void OpenCLCropHelper(int iters,
const std::vector<index_t> &shape0, const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1, const std::vector<index_t> &shape1,
int crop_axis, int crop_axis,
...@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters, ...@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropBM") OpDefBuilder("Crop", "CropBM")
.Input("InputImage0") .Input("Input0")
.Input("InputImage1") .Input("Input1")
.AddIntArg("axis", crop_axis) .AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset}) .AddIntsArg("offset", {offset})
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters, ...@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters,
_##TYPE(int iters) { \ _##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \ std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \ std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
OpenclCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \ OpenCLCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \ } \
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##TYPE) ##_##TYPE)
......
...@@ -34,14 +34,10 @@ void RunCrop(const std::vector<index_t> &input_shape, ...@@ -34,14 +34,10 @@ void RunCrop(const std::vector<index_t> &input_shape,
net.AddRandomInput<D, float>("Input1", input_shape2); net.AddRandomInput<D, float>("Input1", input_shape2);
if (D == GPU) { if (D == GPU) {
BufferToImage<D, float>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropTest") OpDefBuilder("Crop", "CropTest")
.Input("InputImage0") .Input("Input0")
.Input("InputImage1") .Input("Input1")
.Output("OutputImage") .Output("Output")
.AddIntsArg("offset", offset) .AddIntsArg("offset", offset)
.AddIntArg("axis", axis) .AddIntArg("axis", axis)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -66,10 +62,7 @@ void RunCrop(const std::vector<index_t> &input_shape, ...@@ -66,10 +62,7 @@ void RunCrop(const std::vector<index_t> &input_shape,
// Run // Run
net.RunOp(D); net.RunOp(D);
if (D == GPU) { if (D == CPU) {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else if (D == CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
} }
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "mace/ops/arm/deconv_2d_neon.h" #include "mace/ops/arm/deconv_2d_neon.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/deconv_2d.h" #include "mace/ops/opencl/image/deconv_2d.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -358,11 +359,27 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { ...@@ -358,11 +359,27 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public: public:
explicit Deconv2dOp(OpConstructContext *context) explicit Deconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::Deconv2dKernel<T>); kernel_.reset(new opencl::image::Deconv2dKernel<T>);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (model_type_ == FrameworkType::CAFFE) {
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -47,40 +47,21 @@ static void Deconv2d(int iters, ...@@ -47,40 +47,21 @@ static void Deconv2d(int iters,
} }
net.AddRandomInput<D, float>("Filter", net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h, {output_channels, channels, kernel_h,
kernel_w}); kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels}, true);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, net.AddInputFromArray<D, int32_t>("OutputShape", {4},
{batch, out_h, out_w, output_channels}); {batch, out_h, out_w, output_channels},
if (D == DeviceType::GPU) { true);
BufferToImage<D, T>(&net, "Input", "InputImage", OpDefBuilder("Deconv2D", "Deconv2dTest")
ops::BufferType::IN_OUT_CHANNEL); .Input("Input")
BufferToImage<D, T>(&net, "Filter", "FilterImage", .Input("Filter")
ops::BufferType::CONV2D_FILTER); .Input("OutputShape")
BufferToImage<D, T>(&net, "Bias", "BiasImage", .Input("Bias")
ops::BufferType::ARGUMENT); .Output("Output")
OpDefBuilder("Deconv2D", "Deconv2dTest") .AddIntsArg("strides", {stride, stride})
.Input("InputImage") .AddIntArg("padding", padding)
.Input("FilterImage") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Input("OutputShape") .Finalize(net.NewOperatorDef());
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("Input")
.Input("Filter")
.Input("OutputShape")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
}
net.Setup(D); net.Setup(D);
// Warm-up // Warm-up
......
...@@ -41,40 +41,34 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -41,40 +41,34 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ops::FrameworkType model_type) { ops::FrameworkType model_type) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
const index_t batch = input_shape[0];
const index_t out_channels = filter_shape[2]; const index_t out_channels = filter_shape[2];
net.AddInputFromArray<D, float>("Input", input_shape, input_data); net.AddInputFromArray<D, float>("Input", input_shape, input_data);
net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data); net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data, true);
net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data); net.AddInputFromArray<D, float>("Bias", {out_channels}, bias_data, true);
net.TransformDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW); // TODO(liutuo): remove the unused transform
net.TransformFilterDataFormat<D, float>("Filter", HWOI, "FilterOIHW", OIHW);
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "FilterOIHW", "FilterImage",
ops::BufferType::CONV2D_FILTER);
if (model_type == ops::FrameworkType::CAFFE) { if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest") OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage") .Input("Input")
.Input("FilterImage") .Input("FilterOIHW")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size) .AddIntsArg("padding_values", padding_size)
.AddIntArg("framework_type", model_type) .AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape); net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest") OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage") .Input("Input")
.Input("FilterImage") .Input("FilterOIHW")
.Input("OutputShape") .Input("OutputShape")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.AddIntsArg("strides", {stride, stride}) .AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding) .AddIntArg("padding", padding)
.AddIntsArg("padding_values", padding_size) .AddIntsArg("padding_values", padding_size)
...@@ -82,10 +76,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -82,10 +76,6 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} }
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else { } else {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -102,7 +92,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape, ...@@ -102,7 +92,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.AddIntArg("framework_type", model_type) .AddIntArg("framework_type", model_type)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape); net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
OpDefBuilder("Deconv2D", "Deconv2dTest") OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputNCHW") .Input("InputNCHW")
...@@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch, ...@@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch,
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>( net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}); "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}); net.AddRandomInput<D, T>("Bias", {output_channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
int out_h = 0; int out_h = 0;
...@@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch, ...@@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch,
output_shape.push_back(out_h); output_shape.push_back(out_h);
output_shape.push_back(out_w); output_shape.push_back(out_w);
output_shape.push_back(output_channels); output_shape.push_back(output_channels);
net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape); net.AddInputFromArray<D, int32_t>("OutputShape", {4}, output_shape, true);
} else { } else {
paddings.push_back(padding); paddings.push_back(padding);
paddings.push_back(padding); paddings.push_back(padding);
...@@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch, ...@@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch,
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on gpu
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
if (model_type == ops::FrameworkType::CAFFE) { if (model_type == ops::FrameworkType::CAFFE) {
OpDefBuilder("Deconv2D", "Deconv2dTest") OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage") .Input("Input")
.Input("FilterImage") .Input("Filter")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings) .AddIntsArg("padding_values", paddings)
.AddIntArg("framework_type", model_type) .AddIntArg("framework_type", model_type)
...@@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch, ...@@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch,
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else { } else {
OpDefBuilder("Deconv2D", "Deconv2dTest") OpDefBuilder("Deconv2D", "Deconv2dTest")
.Input("InputImage") .Input("Input")
.Input("FilterImage") .Input("Filter")
.Input("OutputShape") .Input("OutputShape")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type) .AddIntArg("padding", type)
.AddIntArg("framework_type", model_type) .AddIntArg("framework_type", model_type)
...@@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch, ...@@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch,
// Run on device // Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4,
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-4,
1e-4); 1e-4);
}; };
......
...@@ -36,23 +36,12 @@ void DepthToSpace( ...@@ -36,23 +36,12 @@ void DepthToSpace(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (D == DeviceType::CPU) { OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("InputImage")
.Output("Output")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape, ...@@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector<index_t> &input_shape,
"Output", NHWC); "Output", NHWC);
} else { } else {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest") OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImage") .Input("Input")
.Output("OutputImage") .Output("Output")
.AddIntArg("block_size", block_size) .AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
} }
if (D == DeviceType::GPU) {
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
}
auto expected = net.CreateTensor<float>(expected_shape, expected_data); auto expected = net.CreateTensor<float>(expected_shape, expected_data);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} }
...@@ -134,28 +128,23 @@ void RandomTest(const int block_size, ...@@ -134,28 +128,23 @@ void RandomTest(const int block_size,
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output",
NHWC); NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("DepthToSpace", "DepthToSpaceTest") OpDefBuilder("DepthToSpace", "DepthToSpaceTest")
.Input("InputImg") .Input("Input")
.AddIntArg("block_size", block_size) .AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg") .Output("GPUOutput")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImg", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_FLOAT) { if (DataTypeToEnum<T>::value == DT_FLOAT) {
ExpectTensorNear<float>(*net.GetTensor("Output"), ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-5); *net.GetOutput("GPUOutput"), 1e-5);
} else { } else {
ExpectTensorNear<float>(*net.GetTensor("Output"), ExpectTensorNear<float>(*net.GetTensor("Output"),
*net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); *net.GetOutput("GPUOutput"), 1e-3, 1e-4);
} }
} }
} // namespace } // namespace
......
...@@ -34,8 +34,9 @@ ...@@ -34,8 +34,9 @@
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/utils/quantize.h" #include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/depthwise_conv2d.h" #include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/depthwise_conv2d.h" #include "mace/ops/opencl/buffer/depthwise_conv2d.h"
#include "mace/ops/opencl/image/depthwise_conv2d.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
...@@ -490,11 +491,27 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase { ...@@ -490,11 +491,27 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
public: public:
explicit DepthwiseConv2dOp(OpConstructContext *context) explicit DepthwiseConv2dOp(OpConstructContext *context)
: DepthwiseConv2dOpBase(context) { : DepthwiseConv2dOpBase(context) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>); kernel_.reset(new opencl::image::DepthwiseConv2dKernel<T>);
} else { } else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>); kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel<T>);
} }
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
1,
OpenCLBufferType::DW_CONV2D_FILTER,
mem_type) == MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
......
...@@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters, ...@@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters,
} }
if (DataTypeToEnum<T>::value != DT_UINT8) { if (DataTypeToEnum<T>::value != DT_UINT8) {
net.AddRandomInput<D, float>( net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w}); "Filter", {multiplier, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}); net.AddRandomInput<D, float>("Bias", {input_channels * multiplier}, true);
} else { } else {
net.AddRandomInput<DeviceType::CPU, uint8_t>( net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier}); "Filter", {kernel_h, kernel_w, input_channels, multiplier}, true);
net.GetTensor("Filter")->SetScale(0.1); net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>( net.AddRandomInput<DeviceType::CPU, int32_t>(
"Bias", {input_channels * multiplier}); "Bias", {input_channels * multiplier}, true);
} }
if (D == DeviceType::CPU) { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
.Input("Bias") .Input("Bias")
...@@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters, ...@@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters,
.AddIntsArg("dilations", {1, 1}) .AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D); net.Setup(D);
......
此差异已折叠。
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/depthwise_deconv2d.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase { ...@@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
public: public:
explicit DepthwiseDeconv2dOp(OpConstructContext *context) explicit DepthwiseDeconv2dOp(OpConstructContext *context)
: Deconv2dOpBase(context) { : Deconv2dOpBase(context) {
MemoryType mem_type = MemoryType::GPU_IMAGE;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>); kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel<T>);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::DW_CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
if (operator_def_->input_size() >= 3) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
......
...@@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters, ...@@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters,
} }
net.AddRandomInput<D, float>("Filter", net.AddRandomInput<D, float>("Filter",
{1, channels, kernel_h, {1, channels, kernel_h,
kernel_w}); kernel_w}, true);
if (D == DeviceType::GPU) { OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
BufferToImage<D, T>(&net, "Input", "InputImage", .Input("Input")
ops::BufferType::IN_OUT_CHANNEL); .Input("Filter")
BufferToImage<D, T>(&net, "Filter", "FilterImage", .Output("Output")
ops::BufferType::DW_CONV2D_FILTER); .AddIntsArg("strides", {stride, stride})
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") .AddIntsArg("padding_values", {padding, padding})
.Input("InputImage") .AddIntArg("group", channels)
.Input("FilterImage") .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("Output") .Finalize(net.NewOperatorDef());
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channels)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
}
net.Setup(D); net.Setup(D);
......
此差异已折叠。
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/utils/quantize.h" #include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/eltwise.h" #include "mace/ops/opencl/image/eltwise.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -1086,12 +1087,28 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation { ...@@ -1086,12 +1087,28 @@ class EltwiseOp<DeviceType::GPU, T> : public Operation {
float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0); float scalar_input = Operation::GetOptionalArg<float>("scalar_input", 1.0);
int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>( int32_t scalar_input_index = Operation::GetOptionalArg<int32_t>(
"scalar_input_index", 1); "scalar_input_index", 1);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::EltwiseKernel<T>( kernel_.reset(new opencl::image::EltwiseKernel<T>(
type, coeff, scalar_input, scalar_input_index)); type, coeff, scalar_input, scalar_input_index));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
// Transform filters
int input_size = operator_def_->input_size();
Workspace *ws = context->workspace();
for (int i = 0; i < input_size; ++i) {
if (ws->HasTensor(operator_def_->input(i)) &&
ws->GetTensor(operator_def_->input(i))->is_weight()) {
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input0 = this->Input(0); const Tensor *input0 = this->Input(0);
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/matmul.h" #include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -351,11 +352,8 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase { ...@@ -351,11 +352,8 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
public: public:
explicit MatMulOp(OpConstructContext *context) explicit MatMulOp(OpConstructContext *context)
: MatMulOpBase(context) { : MatMulOpBase(context) {
if (context->device()->opencl_runtime()->UseImageMemory()) { MACE_UNUSED(context);
kernel_.reset(new opencl::image::MatMulKernel<T>); MACE_NOT_IMPLEMENTED;
} else {
MACE_NOT_IMPLEMENTED;
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
Validate(); Validate();
......
此差异已折叠。
此差异已折叠。
...@@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform( ...@@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform(
} }
}; };
} }
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册