提交 bfbe1a30 编写于 作者: 李寅

Merge branch 'unify-cpu-gpu' into 'master'

Unify cpu gpu

See merge request !877
...@@ -69,9 +69,9 @@ in one deployment file. ...@@ -69,9 +69,9 @@ in one deployment file.
- The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe). - The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe).
If there are more than one tensors, use one line for a tensor. If there are more than one tensors, use one line for a tensor.
* - input_shapes * - input_shapes
- The shapes of the input tensors, in NHWC order. - The shapes of the input tensors, default is NHWC order.
* - output_shapes * - output_shapes
- The shapes of the output tensors, in NHWC order. - The shapes of the output tensors, default is NHWC order.
* - input_ranges * - input_ranges
- The numerical range of the input tensors' data, default [-1, 1]. It is only for test. - The numerical range of the input tensors' data, default [-1, 1]. It is only for test.
* - validation_inputs_data * - validation_inputs_data
...@@ -84,6 +84,10 @@ in one deployment file. ...@@ -84,6 +84,10 @@ in one deployment file.
- [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP.
* - input_data_types * - input_data_types
- [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32.
* - input_data_formats
- [optional] The format of the input tensors, one of [NONE, NHWC]. If there is no format of the input, please use NONE. If only one single format is specified, all inputs will use that format, default is NHWC order.
* - output_data_formats
- [optional] The format of the output tensors, one of [NONE, NHWC]. If there is no format of the output, please use NONE. If only one single format is specified, all outputs will use that format, default is NHWC order.
* - limit_opencl_kernel_time * - limit_opencl_kernel_time
- [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0. - [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0.
* - obfuscate * - obfuscate
......
# one yaml config file can contain multi device info
devices:
# The name of the device
nanopi:
# arm64 or armhf
target_abis: [arm64, armhf]
# device soc, you can get it from device manual
target_socs: RK3399
# device model full name
models: FriendlyElec Nanopi M4
# device ip address
address: 10.0.0.0
# login username
username: user
  # login password; not required if you can log in to the device without a password
password: 1234567
raspberry:
target_abis: [armv7l]
target_socs: BCM2837
models: Raspberry Pi 3 Model B Plus Rev 1.3
address: 10.0.0.1
username: user
password: 123456
...@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false) ...@@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false)
MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true)
MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true)
#undef MACE_GET_REPEATED_ARGUMENT_FUNC #undef MACE_GET_REPEATED_ARGUMENT_FUNC
// Returns true when the model was converted with quantization enabled,
// i.e. its "quantize_flag" argument equals 1 (absent defaults to 0).
bool IsQuantizedModel(const NetDef &net_def) {
  const int quantize_flag =
      ProtoArgHelper::GetOptionalArg<NetDef, int>(net_def, "quantize_flag", 0);
  return quantize_flag == 1;
}
} // namespace mace } // namespace mace
...@@ -55,6 +55,8 @@ class ProtoArgHelper { ...@@ -55,6 +55,8 @@ class ProtoArgHelper {
std::map<std::string, Argument> arg_map_; std::map<std::string, Argument> arg_map_;
}; };
bool IsQuantizedModel(const NetDef &def);
} // namespace mace } // namespace mace
#endif // MACE_CORE_ARG_HELPER_H_ #endif // MACE_CORE_ARG_HELPER_H_
...@@ -233,6 +233,11 @@ class Image : public BufferBase { ...@@ -233,6 +233,11 @@ class Image : public BufferBase {
} }
} }
  // Returns the data type of the elements stored in this image.
  // NOTE(review): requires the underlying buffer to be allocated;
  // the check aborts if buf_ is null.
  inline DataType dtype() const {
    MACE_CHECK_NOTNULL(buf_);
    return data_type_;
  }
void *buffer() { void *buffer() {
MACE_CHECK_NOTNULL(buf_); MACE_CHECK_NOTNULL(buf_);
return buf_; return buf_;
......
...@@ -34,7 +34,7 @@ class Device { ...@@ -34,7 +34,7 @@ class Device {
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
virtual OpenCLRuntime *opencl_runtime() = 0; virtual OpenCLRuntime *opencl_runtime() = 0;
#endif #endif // MACE_ENABLE_OPENCL
virtual CPURuntime *cpu_runtime() = 0; virtual CPURuntime *cpu_runtime() = 0;
virtual Allocator *allocator() = 0; virtual Allocator *allocator() = 0;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/memory_optimizer.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_set>
#include "mace/core/arg_helper.h"
#include "mace/core/macros.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
// Returns true for ops whose output can alias the input buffer directly
// (no data movement), so the output may reuse the input's memory block.
bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) {
  static const std::unordered_set<std::string> kReuseOps = {
      "Reshape", "Identity", "Squeeze", "ExpandDims"};
  return kReuseOps.find(op_type) != kReuseOps.end();
}
// Registers a tensor with an initial reference count of 1, or bumps the
// existing count if the tensor has been seen before.
void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) {
  auto it = tensor_ref_count_.find(tensor_name);
  if (it == tensor_ref_count_.end()) {
    tensor_ref_count_.emplace(tensor_name, 1);
  } else {
    it->second += 1;
  }
}
// Updates reference counts for one operation: each already-tracked input
// gains a reference (inputs that are not tracked are left alone — they are
// typically weights owned by the workspace), and every output is registered
// with zero references if not present yet.
void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) {
  const int num_inputs = op_def->input_size();
  for (int i = 0; i < num_inputs; ++i) {
    auto it = tensor_ref_count_.find(op_def->input(i));
    if (it != tensor_ref_count_.end()) {
      it->second += 1;
    }
  }
  const int num_outputs = op_def->output_size();
  for (int i = 0; i < num_outputs; ++i) {
    // emplace is a no-op when the output is already tracked.
    tensor_ref_count_.emplace(op_def->output(i), 0);
  }
}
// Builds the memory block describing the storage required by a tensor of
// `shape` and data type `dt` in the given memory type.
MemoryBlock MemoryOptimizer::CreateMemoryBlock(
    std::vector<int64_t> shape,
    DataType dt,
    mace::MemoryType mem_type) {
  MemoryBlock block;
#ifdef MACE_ENABLE_OPENCL
  if (mem_type == MemoryType::GPU_IMAGE) {
    // GPU image memory is 2-D; derive the image extent from a 4-D shape
    // (a 2-D shape is lifted to N,1,1,C first).
    if (shape.size() == 2) {
      shape = {shape[0], 1, 1, shape[1]};
    } else {
      MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input";
    }
    std::vector<size_t> image_shape;
    OpenCLUtil::CalImage2DShape(shape,
                                OpenCLBufferType::IN_OUT_CHANNEL,
                                &image_shape);
    block.set_x(image_shape[0]);
    block.set_y(image_shape[1]);
    return block;
  }
#endif  // MACE_ENABLE_OPENCL
  MACE_UNUSED(mem_type);
  // Buffer memory is 1-D: total bytes = element size * product of dims.
  int64_t total_bytes = GetEnumTypeSize(dt);
  for (const int64_t dim : shape) {
    total_bytes *= dim;
  }
  block.set_x(total_bytes);
  block.set_y(1);
  return block;
}
// Plans memory for every output of `op_def`. For each output it prefers to
// grow an idle block (best fit: minimal added size, then minimal waste)
// over allocating a new one; memory-reuse ops simply alias their first
// input's block. Afterwards the op's inputs are de-referenced so their
// blocks can become idle for later ops.
//
// Fixes vs. previous revision:
//  * removed a dead, shadowing re-declaration of `shape` in the non-reuse
//    branch (it was never read);
//  * the buffer-reuse path now sets y on `new_mem_block`, so an
//    indeterminate y_ is no longer copied into mem_blocks_ (buffer blocks
//    keep the y == 1 invariant established at creation).
void MemoryOptimizer::Optimize(
    const mace::OperatorDef *op_def,
    const std::unordered_map<std::string, MemoryType> &mem_types) {
  MACE_LATENCY_LOGGER(2, "Optimize memory");
  if (op_def->output_size() != op_def->output_shape_size()) {
    VLOG(1) << op_def->name()
            << ": the number of output shape "
            << "is not equal to the number of output";
    return;
  }
  auto device = static_cast<DeviceType>(op_def->device_type());
  DataType op_dtype = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
      *op_def,
      "T",
      static_cast<int>(DT_FLOAT)));
  MACE_CHECK(
      op_def->output_type_size() == 0 ||
          op_def->output_size() == op_def->output_type_size(),
      "operator output size != operator output type size",
      op_def->output_size(),
      op_def->output_type_size());
  DataType dt;

  int output_size = op_def->output_size();
  for (int i = 0; i < output_size; ++i) {
    // Per-output data type falls back to the op-level "T" argument.
    if (i < op_def->output_type_size()) {
      dt = op_def->output_type(i);
    } else {
      dt = op_dtype;
    }
    int best_mem_id = -1;
    MemoryType mem_type = MemoryType::CPU_BUFFER;
    if (device == DeviceType::GPU) {
      mem_type = mem_types.at(op_def->output(i));
    }
    auto shape = std::vector<int64_t>(
        op_def->output_shape(i).dims().begin(),
        op_def->output_shape(i).dims().end());
    MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type);
    MemoryBlock best_mem_block;
    if (IsMemoryReuseOp(op_def->type())) {
      // Memory-reuse ops alias their first input's block directly.
      if (tensor_mem_map_.count(op_def->input(0)) == 1) {
        best_mem_id = tensor_mem_map_[op_def->input(0)].first;
      }
    } else {
      int64_t op_mem_size = op_mem_block.x() * op_mem_block.y();
      int64_t best_added_mem_size = LLONG_MAX;
      int64_t best_wasted_mem_size = LLONG_MAX;
      int64_t old_mem_size = 0, new_mem_size = 0;
      MemoryBlock new_mem_block;
      for (auto idle_mem_id : idle_blocks_) {
        if (mem_blocks_[idle_mem_id].mem_type() == mem_type) {
          if (mem_type == MemoryType::GPU_IMAGE) {
            // GPU Image could reuse memory with same data type only
            if (mem_blocks_[idle_mem_id].data_type() != dt) {
              continue;
            }
            old_mem_size =
                mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y();
            new_mem_block.set_x(std::max<int64_t>(mem_blocks_[idle_mem_id].x(),
                                                  op_mem_block.x()));
            new_mem_block.set_y(std::max<int64_t>(mem_blocks_[idle_mem_id].y(),
                                                  op_mem_block.y()));
            new_mem_size = new_mem_block.x() * new_mem_block.y();
          } else {
            old_mem_size = mem_blocks_[idle_mem_id].x();
            new_mem_size = std::max(op_mem_size, old_mem_size);
            new_mem_block.set_x(new_mem_size);
            // Buffer blocks are 1-D; keep the y == 1 invariant so an unset
            // y_ is never copied into best_mem_block below.
            new_mem_block.set_y(1);
          }
          int64_t added_mem_size = new_mem_size - old_mem_size;
          int64_t wasted_mem_size = new_mem_size - op_mem_size;
          // minimize add_mem_size; if best_mem_add_size is 0,
          // then minimize waste_mem_size
          if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size)
              || (best_added_mem_size == 0 &&
                  wasted_mem_size < best_wasted_mem_size)) {
            best_mem_id = idle_mem_id;
            best_added_mem_size = added_mem_size;
            best_wasted_mem_size = wasted_mem_size;
            best_mem_block = new_mem_block;
          }
        }
      }
      if (best_added_mem_size <= op_mem_size) {
        // Grow the chosen idle block in place and take it out of the pool.
        best_mem_block.set_mem_id(best_mem_id);
        best_mem_block.set_data_type(dt);
        best_mem_block.set_mem_type(mem_type);
        mem_blocks_[best_mem_id] = best_mem_block;
        idle_blocks_.erase(best_mem_id);
      } else {
        // Reuse would cost more than a fresh allocation: create a new block.
        best_mem_id = static_cast<int>(mem_blocks_.size());
        best_mem_block.set_mem_id(best_mem_id);
        best_mem_block.set_data_type(dt);
        best_mem_block.set_mem_type(mem_type);
        best_mem_block.set_x(op_mem_block.x());
        best_mem_block.set_y(op_mem_block.y());
        mem_blocks_.push_back(best_mem_block);
      }
    }
    if (best_mem_id != -1) {
      if (mem_ref_count_.count(best_mem_id) == 1) {
        mem_ref_count_[best_mem_id] += 1;
      } else {
        mem_ref_count_[best_mem_id] = 1;
      }
      tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt);
    }
  }

  // de-refer input tensors
  int input_size = op_def->input_size();
  for (int i = 0; i < input_size; ++i) {
    auto &input_name = op_def->input(i);
    if (tensor_ref_count_.count(input_name) == 1) {
      tensor_ref_count_[input_name] -= 1;
      if (tensor_ref_count_.at(input_name) == 0 &&
          tensor_mem_map_.count(input_name) == 1) {
        int mem_id = tensor_mem_map_.at(input_name).first;
        mem_ref_count_[mem_id] -= 1;
        if (mem_ref_count_.at(mem_id) == 0) {
          idle_blocks_.insert(mem_id);
        }
      } else {
        MACE_CHECK(tensor_ref_count_.at(input_name) >= 0);
      }
    }
  }
}
// Returns the planned memory blocks, indexed by mem_id.
const std::vector<MemoryBlock>& MemoryOptimizer::mem_blocks() const {
  return mem_blocks_;
}
// Returns the mapping from tensor name to its assigned
// <mem_id, data_type> pair.
const std::unordered_map<std::string, std::pair<int, DataType>>&
MemoryOptimizer::tensor_mem_map() const {
  return tensor_mem_map_;
}
std::string MemoryOptimizer::DebugInfo() const {
auto memory_type_to_str = [](const MemoryType type) -> std::string {
if (type == MemoryType::CPU_BUFFER) {
return "CPU_BUFFER";
} else if (type == MemoryType::GPU_BUFFER) {
return "GPU_BUFFER";
} else if (type == MemoryType::GPU_IMAGE) {
return "GPU_IMAGE";
} else {
return "UNKNOWN";
}
};
std::stringstream sstream;
sstream << "\n";
size_t block_size = mem_blocks_.size();
for (size_t i = 0; i < block_size; ++i) {
sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type())
<< " ";
if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) {
sstream << DataTypeToString(mem_blocks_[i].data_type()) << " "
"[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]";
} else {
sstream << "[" << mem_blocks_[i].x() << "]";
}
sstream << "\n";
}
return sstream.str();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_
#define MACE_CORE_MEMORY_OPTIMIZER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
// Describes one reusable chunk of memory planned by MemoryOptimizer.
// For GPU image memory, (x, y) is the 2-D image extent; for buffer memory,
// x is the size in bytes and y is 1.
//
// Fix: members are now default-initialized. Previously a default-constructed
// block carried indeterminate values, and MemoryOptimizer::Optimize copies
// partially-filled blocks (e.g. new_mem_block / best_mem_block), which could
// read an unset member. -1 matches the "no block" sentinel used by callers.
class MemoryBlock {
 public:
  inline void set_mem_id(int mem_id) {
    mem_id_ = mem_id;
  }

  inline int mem_id() const {
    return mem_id_;
  }

  inline void set_data_type(DataType data_type) {
    data_type_ = data_type;
  }

  inline DataType data_type() const {
    return data_type_;
  }

  inline void set_mem_type(MemoryType mem_type) {
    mem_type_ = mem_type;
  }

  inline MemoryType mem_type() const {
    return mem_type_;
  }

  inline void set_x(int64_t x) {
    x_ = x;
  }

  inline int64_t x() const {
    return x_;
  }

  inline void set_y(int64_t y) {
    y_ = y;
  }

  inline int64_t y() const {
    return y_;
  }

 private:
  int mem_id_ = -1;                              // -1 == not assigned
  DataType data_type_ = DataType::DT_FLOAT;
  MemoryType mem_type_ = MemoryType::CPU_BUFFER;
  int64_t x_ = 0;
  int64_t y_ = 0;
};
// Plans memory reuse across the operations of a network: tracks tensor
// reference counts, assigns each output tensor to a memory block, and
// recycles blocks once their last reader has run.
class MemoryOptimizer {
 public:
  // True for ops (Reshape, Identity, ...) whose output may alias its input.
  static bool IsMemoryReuseOp(const std::string &op_type);
  // Adds one reference to a single tensor (registers it if unseen).
  void UpdateTensorRef(const std::string &tensor_name);
  // Adds references for an op's tracked inputs and registers its outputs.
  void UpdateTensorRef(const OperatorDef *op_def);
  // Assigns memory blocks to the op's outputs; `mem_types` gives the target
  // memory type per output tensor (used on GPU).
  void Optimize(const OperatorDef *op_def,
                const std::unordered_map<std::string, MemoryType> &mem_types);
  const std::vector<MemoryBlock> &mem_blocks() const;
  const std::unordered_map<std::string,
                           std::pair<int, DataType>> &tensor_mem_map() const;
  std::string DebugInfo() const;

 private:
  MemoryBlock CreateMemoryBlock(std::vector<int64_t> shape,
                                DataType dt,
                                MemoryType mem_type);

 private:
  // tensor name -> number of outstanding readers.
  std::unordered_map<std::string, int> tensor_ref_count_;
  // All planned blocks; a block's mem_id is its index here.
  std::vector<MemoryBlock> mem_blocks_;
  // tensor name : <mem_id, data_type>
  // Buffer memory does not distinguish data types, so the data type is
  // stored alongside the mem_id (GPU image reuse requires matching types).
  std::unordered_map<std::string, std::pair<int, DataType>> tensor_mem_map_;
  // mem_id -> number of live tensors currently occupying the block.
  std::unordered_map<int, int> mem_ref_count_;
  // mem_ids of blocks that are free to be reused.
  std::set<int> idle_blocks_;
};
} // namespace mace
#endif // MACE_CORE_MEMORY_OPTIMIZER_H_
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/op_context.h" #include "mace/core/op_context.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -25,13 +26,94 @@ ...@@ -25,13 +26,94 @@
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_util.h"
#endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
namespace {
struct InternalOutputInfo {
InternalOutputInfo(const MemoryType mem_type,
const DataType dtype,
const std::vector<index_t> &shape,
int op_idx)
: mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {}
MemoryType mem_type; // transformed memory type
DataType dtype;
std::vector<index_t> shape; // tensor shape
int op_idx; // operation which generate the tensor
};
#ifdef MACE_ENABLE_OPENCL
std::string TransformedName(const std::string &input_name,
const mace::MemoryType mem_type) {
std::stringstream ss;
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
#endif // MACE_ENABLE_OPENCL
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat data_format_flag,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
// Get available devices
auto available_devices = op_registry->AvailableDevices(op_def->type());
// Find the device type to run the op.
// If the target_device_type in available devices, use target_device_type,
// otherwise, fallback to CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context->set_device(target_device_);
if (target_device_->device_type() == DeviceType::GPU) {
construct_context->set_output_mem_type(MemoryType::GPU_IMAGE);
}
break;
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
if (data_format_flag == NHWC &&
op_def->output_shape(out_idx).dims_size() == 4) {
// NHWC -> NCHW
std::vector<index_t> output_shape =
TransposeShape<index_t, index_t>(
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
{0, 3, 1, 2});
for (int i = 0; i < 4; ++i) {
op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]);
}
}
}
}
construct_context->set_operator_def(op_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(construct_context, device_type));
return std::move(op);
}
SerialNet::SerialNet(const OpRegistryBase *op_registry, SerialNet::SerialNet(const OpRegistryBase *op_registry,
const NetDef *net_def, const NetDef *net_def,
Workspace *ws, Workspace *ws,
Device *target_device, Device *target_device,
const NetMode mode) MemoryOptimizer *mem_optimizer)
: NetBase(), : NetBase(),
ws_(ws), ws_(ws),
target_device_(target_device), target_device_(target_device),
...@@ -40,44 +122,211 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, ...@@ -40,44 +122,211 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_device->cpu_runtime()->policy(), target_device->cpu_runtime()->policy(),
target_device->cpu_runtime()->use_gemmlowp())) { target_device->cpu_runtime()->use_gemmlowp())) {
MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); MACE_LATENCY_LOGGER(1, "Constructing SerialNet");
// Create Operations // output tensor : related information
DeviceType target_device_type = target_device_->device_type(); std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
// add input information
MemoryType target_mem_type;
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
//
DataFormat data_format_flag = NHWC;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
// Only could be NONE or NHWC
auto input_data_format = static_cast<DataFormat>(
input_info.data_format());
if (!is_quantize_model &&
input_data_format == NHWC &&
input_info.dims_size() == 4) {
// NHWC -> NCHW
input_shape =
TransposeShape<index_t, index_t>(input_shape, {0, 3, 1, 2});
} else if (input_data_format == DataFormat::DF_NONE) {
data_format_flag = DataFormat::DF_NONE;
}
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#ifdef MACE_ENABLE_OPENCL
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
for (auto &input_info : net_def->input_info()) {
std::vector<index_t> input_shape =
std::vector<index_t>(input_info.dims().begin(),
input_info.dims().end());
output_map.emplace(input_info.name(), InternalOutputInfo(
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#endif // MACE_ENABLE_OPENCL
OpConstructContext construct_context(ws_); OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
// Create the Operation // Create operation
const int op_device = auto op = CreateOperation(op_registry,
&construct_context,
op_def,
data_format_flag,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
const DataType dt =
static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
operator_def, "device", static_cast<int>(target_device_type)); *op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
if (op_device == target_device_type) { // the outputs' memory type of the operation
// Get available devices (sorted based on priority) MemoryType out_mem_type = construct_context.output_mem_type();
OperatorDef temp_def(operator_def); int input_size = op_def->input_size();
auto available_devices = op_registry->AvailableDevices(temp_def.type()); for (int i = 0; i < input_size; ++i) {
// Find the device type to run the op. if (output_map.count(op_def->input(i)) == 1) {
// If the target_device_type in available devices, use target_device_type, // if op is memory-reuse op, no transformation
// otherwise, fallback to CPU device. if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
DeviceType device_type = DeviceType::CPU; out_mem_type = output_map.at(op_def->input(i)).mem_type;
construct_context.set_device(cpu_device_);
for (auto device : available_devices) {
if (device == target_device_type) {
device_type = target_device_type;
construct_context.set_device(target_device_);
break; break;
} }
// check whether is the output tensor of other operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
} }
temp_def.set_device_type(device_type); auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
construct_context.set_operator_def(&temp_def); input_name, input_shape, t_input_name,
std::unique_ptr<Operation> op( dt, out_mem_type);
op_registry->CreateOperation(&construct_context, device_type, mode)); auto transform_op = CreateOperation(
if (op) { op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
// update the map : output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
InternalOutputInfo(
out_mem_type,
dt,
op_def->output_shape().empty() ?
std::vector<index_t>() :
std::vector<index_t>(
op_def->output_shape(out_idx).dims().begin(),
op_def->output_shape(out_idx).dims().end()),
static_cast<int>(operators_.size())));
}
}
#endif // MACE_ENABLE_OPENCL
operators_.emplace_back(std::move(op)); operators_.emplace_back(std::move(op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(op_def.get());
}
#ifdef MACE_ENABLE_OPENCL
// Transform the output tensor if necessary
if (target_device_->device_type() == DeviceType::GPU) {
for (auto &output_info : net_def->output_info()) {
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
operators_[internal_output_info.op_idx]->operator_def();
int output_size = output_op_def->output_size();
for (int i = 0; i < output_size; ++i) {
if (output_op_def->output(i) == output_info.name()) {
output_op_def->set_output(i, t_output_name);
// update the output : mem_type map
output_mem_map[t_output_name] = output_mem_map[output_info.name()];
output_mem_map[output_info.name()] = target_mem_type;
}
}
auto output_data_format =
static_cast<DataFormat>(output_info.data_format());
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
output_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
}
} }
} }
#endif // MACE_ENABLE_OPENCL
// Update output tensor reference
for (auto &output_info : net_def->output_info()) {
mem_optimizer->UpdateTensorRef(output_info.name());
} }
// Do memory optimization
for (auto &op : operators_) {
VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type()
<< ", " << op->debug_def().type() << ">";
mem_optimizer->Optimize(op->operator_def().get(), output_mem_map);
}
VLOG(1) << mem_optimizer->DebugInfo();
} }
MaceStatus SerialNet::Init() { MaceStatus SerialNet::Init() {
// TODO(liuqi): where to do memory reuse.
MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); MACE_LATENCY_LOGGER(1, "Initializing SerialNet");
OpInitContext init_context(ws_); OpInitContext init_context(ws_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
...@@ -95,18 +344,18 @@ MaceStatus SerialNet::Init() { ...@@ -95,18 +344,18 @@ MaceStatus SerialNet::Init() {
} }
MaceStatus SerialNet::Run(RunMetadata *run_metadata) { MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
// TODO(liuqi): In/Out Buffer Transform
MACE_MEMORY_LOGGING_GUARD(); MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
OpContext context(ws_, cpu_device_); OpContext context(ws_, cpu_device_);
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
auto &op = *iter; auto &op = *iter;
DeviceType device_type = op->device_type(); DeviceType device_type = op->device_type();
MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(),
"<", device_type, ", ", op->debug_def().type(), ">", "<", device_type, ", ", op->debug_def().type(),
". mem_id: ", ", ",
MakeListString(op->debug_def().mem_id().data(), ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op->debug_def().mem_id().size())); op->debug_def(), "T", static_cast<int>(DT_FLOAT)),
">");
if (device_type == target_device_->device_type()) { if (device_type == target_device_->device_type()) {
context.set_device(target_device_); context.set_device(target_device_);
} else { } else {
...@@ -173,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -173,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
float max_v = std::numeric_limits<float>::lowest(); float max_v = std::numeric_limits<float>::lowest();
float min_v = std::numeric_limits<float>::max(); float min_v = std::numeric_limits<float>::max();
Tensor::MappingGuard guard(op->Output(i)); Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>(); auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) { for (index_t j = 0; j < op->Output(i)->size(); ++j) {
max_v = std::max(max_v, output_data[j]); max_v = std::max(max_v, output_data[j]);
min_v = std::min(min_v, output_data[j]); min_v = std::min(min_v, output_data[j]);
...@@ -189,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { ...@@ -189,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
std::vector<int> bin_distribution(bin_size, 0); std::vector<int> bin_distribution(bin_size, 0);
float bin_v = (max_v - min_v) / bin_size; float bin_v = (max_v - min_v) / bin_size;
Tensor::MappingGuard guard(op->Output(i)); Tensor::MappingGuard guard(op->Output(i));
const float *output_data = op->Output(i)->data<float>(); auto *output_data = op->Output(i)->data<float>();
for (index_t j = 0; j < op->Output(i)->size(); ++j) { for (index_t j = 0; j < op->Output(i)->size(); ++j) {
int ind = static_cast<int>((output_data[j] - min_v) / bin_v); int index = static_cast<int>((output_data[j] - min_v) / bin_v);
if (ind < 0) if (index < 0)
ind = 0; index = 0;
else if (ind > bin_size-1) else if (index > bin_size-1)
ind = bin_size-1; index = bin_size-1;
bin_distribution[ind]++; bin_distribution[index]++;
} }
LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) LOG(INFO) << "Tensor range @@" << op->debug_def().output(i)
<< "@@" << min_v << "," << max_v<< "@@" << "@@" << min_v << "," << max_v<< "@@"
......
...@@ -27,6 +27,7 @@ namespace mace { ...@@ -27,6 +27,7 @@ namespace mace {
class RunMetadata; class RunMetadata;
class Workspace; class Workspace;
class MemoryOptimizer;
class NetBase { class NetBase {
public: public:
...@@ -47,12 +48,20 @@ class SerialNet : public NetBase { ...@@ -47,12 +48,20 @@ class SerialNet : public NetBase {
const NetDef *net_def, const NetDef *net_def,
Workspace *ws, Workspace *ws,
Device *target_device, Device *target_device,
const NetMode mode = NetMode::NORMAL); MemoryOptimizer * mem_optimizer);
MaceStatus Init() override; MaceStatus Init() override;
MaceStatus Run(RunMetadata *run_metadata = nullptr) override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override;
private:
std::unique_ptr<Operation> CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
DataFormat input_format,
bool is_quantize_model = false);
protected: protected:
Workspace *ws_; Workspace *ws_;
Device *target_device_; Device *target_device_;
......
...@@ -23,16 +23,12 @@ namespace mace { ...@@ -23,16 +23,12 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws) OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {} : operator_def_(nullptr), ws_(ws), device_(nullptr) {}
OpConstructContext::OpConstructContext(OperatorDef *operator_def,
Workspace *ws,
Device *device)
: operator_def_(operator_def), ws_(ws), device_(device) {}
OpInitContext::OpInitContext(Workspace *ws, Device *device) OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {} : ws_(ws), device_(device) {}
Operation::Operation(OpConstructContext *context) Operation::Operation(OpConstructContext *context)
: operator_def_(std::make_shared<OperatorDef>(*(context->operator_def()))) : operator_def_(context->operator_def())
{} {}
MaceStatus Operation::Init(OpInitContext *context) { MaceStatus Operation::Init(OpInitContext *context) {
...@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) { ...@@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) {
": Encountered a non-existing input tensor: ", input_str); ": Encountered a non-existing input tensor: ", input_str);
inputs_.push_back(tensor); inputs_.push_back(tensor);
} }
// TODO(liuqi): filter transform
for (int i = 0; i < operator_def_->output_size(); ++i) { for (int i = 0; i < operator_def_->output_size(); ++i) {
const std::string output_str = operator_def_->output(i); const std::string output_str = operator_def_->output(i);
if (ws->HasTensor(output_str)) { if (ws->HasTensor(output_str)) {
// TODO(liuqi): Workspace should pre-allocate all of the output tensors
outputs_.push_back(ws->GetTensor(output_str)); outputs_.push_back(ws->GetTensor(output_str));
} else { } else {
MACE_CHECK( MACE_CHECK(
...@@ -66,7 +60,7 @@ MaceStatus Operation::Init(OpInitContext *context) { ...@@ -66,7 +60,7 @@ MaceStatus Operation::Init(OpInitContext *context) {
} }
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
output_str, context->device()->allocator(), output_type))); output_str, context->device()->allocator(), output_type)));
}
if (i < operator_def_->output_shape_size()) { if (i < operator_def_->output_shape_size()) {
std::vector<index_t> std::vector<index_t>
shape_configured(operator_def_->output_shape(i).dims_size()); shape_configured(operator_def_->output_shape(i).dims_size());
...@@ -76,7 +70,6 @@ MaceStatus Operation::Init(OpInitContext *context) { ...@@ -76,7 +70,6 @@ MaceStatus Operation::Init(OpInitContext *context) {
ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
} }
} }
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
...@@ -164,19 +157,23 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices( ...@@ -164,19 +157,23 @@ const std::set<DeviceType> OpRegistryBase::AvailableDevices(
std::unique_ptr<Operation> OpRegistryBase::CreateOperation( std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type, DeviceType device_type) const {
const NetMode mode) const { auto operator_def = context->operator_def();
OperatorDef *operator_def = context->operator_def(); DataType dtype = static_cast<DataType>(
const DataType dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>( ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def, "T", static_cast<int>(DT_FLOAT))); *operator_def, "T", static_cast<int>(DT_FLOAT)));
const int op_mode_i = ProtoArgHelper::GetOptionalArg<OperatorDef, int>( if (device_type == DeviceType::CPU && dtype == DT_HALF) {
*operator_def, "mode", static_cast<int>(NetMode::NORMAL)); int arg_size = operator_def->arg_size();
const NetMode op_mode = static_cast<NetMode>(op_mode_i); for (int i = 0; i < arg_size; ++i) {
VLOG(3) << "Creating operator " << operator_def->name() << "(" if (operator_def->arg(i).name() == "T") {
operator_def->mutable_arg(i)->set_i(DT_FLOAT);
}
}
dtype = DT_FLOAT;
}
VLOG(1) << "Creating operator " << operator_def->name() << "("
<< operator_def->type() << "<" << dtype << ">" << ") on " << operator_def->type() << "<" << dtype << ">" << ") on "
<< device_type; << device_type;
if (op_mode == mode) {
const std::string op_type = context->operator_def()->type(); const std::string op_type = context->operator_def()->type();
MACE_CHECK(registry_.count(op_type) != 0, MACE_CHECK(registry_.count(op_type) != 0,
op_type, " operation is not registered."); op_type, " operation is not registered.");
...@@ -189,8 +186,5 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation( ...@@ -189,8 +186,5 @@ std::unique_ptr<Operation> OpRegistryBase::CreateOperation(
LOG(FATAL) << "Key not registered: " << key; LOG(FATAL) << "Key not registered: " << key;
} }
return registry_.at(op_type)->creators.at(key)(context); return registry_.at(op_type)->creators.at(key)(context);
} else {
return nullptr;
}
} }
} // namespace mace } // namespace mace
...@@ -33,14 +33,13 @@ namespace mace { ...@@ -33,14 +33,13 @@ namespace mace {
class OpConstructContext { class OpConstructContext {
public: public:
explicit OpConstructContext(Workspace *ws); explicit OpConstructContext(Workspace *ws);
OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device);
~OpConstructContext() = default; ~OpConstructContext() = default;
inline void set_operator_def(OperatorDef *operator_def) { inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def; operator_def_ = operator_def;
} }
inline OperatorDef *operator_def() const { inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_; return operator_def_;
} }
...@@ -56,10 +55,19 @@ class OpConstructContext { ...@@ -56,10 +55,19 @@ class OpConstructContext {
return device_; return device_;
} }
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
private: private:
OperatorDef *operator_def_; std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_; Workspace *ws_;
Device *device_; Device *device_;
MemoryType output_mem_type_; // used for transform memory
}; };
// memory_optimizer, device // memory_optimizer, device
...@@ -131,14 +139,18 @@ class Operation { ...@@ -131,14 +139,18 @@ class Operation {
} }
inline void set_debug_def( inline void set_debug_def(
const std::shared_ptr<const OperatorDef> &operator_def) { const std::shared_ptr<OperatorDef> &operator_def) {
operator_def_ = operator_def; operator_def_ = operator_def;
} }
inline bool has_debug_def() const { return operator_def_ != nullptr; } inline bool has_debug_def() const { return operator_def_ != nullptr; }
inline std::shared_ptr<OperatorDef> operator_def() {
return operator_def_;
}
protected: protected:
std::shared_ptr<const OperatorDef> operator_def_; std::shared_ptr<OperatorDef> operator_def_;
std::vector<const Tensor *> inputs_; std::vector<const Tensor *> inputs_;
std::vector<Tensor *> outputs_; std::vector<Tensor *> outputs_;
...@@ -190,8 +202,7 @@ class OpRegistryBase { ...@@ -190,8 +202,7 @@ class OpRegistryBase {
std::unique_ptr<Operation> CreateOperation( std::unique_ptr<Operation> CreateOperation(
OpConstructContext *context, OpConstructContext *context,
DeviceType device_type, DeviceType device_type) const;
const NetMode mode) const;
template <class DerivedType> template <class DerivedType>
static std::unique_ptr<Operation> DefaultCreator( static std::unique_ptr<Operation> DefaultCreator(
......
...@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime( ...@@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime(
is_profiling_enabled_(false), is_profiling_enabled_(false),
opencl_version_(CL_VER_UNKNOWN), opencl_version_(CL_VER_UNKNOWN),
gpu_type_(UNKNOWN), gpu_type_(UNKNOWN),
mem_type_(MemoryType::GPU_IMAGE) { mem_type_(MemoryType::GPU_IMAGE),
scratch_image_manager_(new ScratchImageManager) {
std::vector<cl::Platform> all_platforms; std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms); cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) { if (all_platforms.size() == 0) {
...@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const { ...@@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_; return is_profiling_enabled_;
} }
ScratchImageManager* OpenCLRuntime::scratch_image_manager() const {
return scratch_image_manager_.get();
}
} // namespace mace } // namespace mace
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "mace/core/file_storage.h" #include "mace/core/file_storage.h"
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/scratch_image.h"
#include "mace/proto/mace.pb.h" #include "mace/proto/mace.pb.h"
#include "mace/utils/string_util.h" #include "mace/utils/string_util.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
...@@ -82,6 +83,7 @@ class OpenCLRuntime { ...@@ -82,6 +83,7 @@ class OpenCLRuntime {
uint64_t device_global_mem_cache_size() const; uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const; uint32_t device_compute_units() const;
Tuner<uint32_t> *tuner(); Tuner<uint32_t> *tuner();
ScratchImageManager *scratch_image_manager() const;
bool is_opencl_avaliable(); bool is_opencl_avaliable();
// TODO(liuqi): remove this function in the future, make decision at runtime. // TODO(liuqi): remove this function in the future, make decision at runtime.
bool UseImageMemory(); bool UseImageMemory();
...@@ -134,6 +136,7 @@ class OpenCLRuntime { ...@@ -134,6 +136,7 @@ class OpenCLRuntime {
OpenCLVersion opencl_version_; OpenCLVersion opencl_version_;
GPUType gpu_type_; GPUType gpu_type_;
MemoryType mem_type_; MemoryType mem_type_;
std::unique_ptr<ScratchImageManager> scratch_image_manager_;
// All OpenCL object must be a pointer and manually deleted before unloading // All OpenCL object must be a pointer and manually deleted before unloading
// OpenCL library. // OpenCL library.
std::shared_ptr<cl::Context> context_; std::shared_ptr<cl::Context> context_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/opencl_util.h"
#include <utility>
#include "mace/utils/logging.h"
namespace mace {
namespace {
// [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic, H * W * (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1];
(*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* MIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[1]);
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[0]);
(*image_shape)[1] = 1;
}
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> *image_shape,
const int blk_size) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2));
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
}
// [Ic * H * W, (Oc + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = shape[1] * shape[2] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[0]);
}
// [(Ic + 3) / 4 * H * W, Oc]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* OIHW */
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3];
(*image_shape)[1] = shape[0];
}
} // namespace
// Computes the 2D OpenCL image shape ([width, height]) used to hold a
// tensor of the given logical shape, dispatching on the buffer type.
// `wino_block_size` is only consulted for WINOGRAD_FILTER.
// Aborts via LOG(FATAL) on an unknown buffer type.
void OpenCLUtil::CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                                 const OpenCLBufferType type,
                                 std::vector<size_t> *image_shape,
                                 const int wino_block_size) {
  MACE_CHECK_NOTNULL(image_shape);
  switch (type) {
    // Activation layouts.
    case IN_OUT_CHANNEL:
      CalInOutputImageShape(shape, image_shape);
      break;
    case IN_OUT_HEIGHT:
      CalInOutHeightImageShape(shape, image_shape);
      break;
    case IN_OUT_WIDTH:
      CalInOutWidthImageShape(shape, image_shape);
      break;
    // Filter / weight layouts.
    case CONV2D_FILTER:
      CalConv2dFilterImageShape(shape, image_shape);
      break;
    case DW_CONV2D_FILTER:
      CalDepthwiseConv2dFilterImageShape(shape, image_shape);
      break;
    case WINOGRAD_FILTER:
      CalWinogradFilterImageShape(shape, image_shape, wino_block_size);
      break;
    case WEIGHT_HEIGHT:
      CalWeightHeightImageShape(shape, image_shape);
      break;
    case WEIGHT_WIDTH:
      CalWeightWidthImageShape(shape, image_shape);
      break;
    // 1-D arguments.
    case ARGUMENT:
      CalArgImageShape(shape, image_shape);
      break;
    default:
      LOG(FATAL) << "Mace not supported yet.";
  }
}
// Builds an OperatorDef for a "BufferTransform" op that converts
// `input_name` into `output_name` with the requested data type and target
// memory type. The op is named "mace_node_" + output_name and pinned to
// the GPU device; `input_shape`, when non-empty, is recorded as the
// output shape hint.
std::shared_ptr<OperatorDef> OpenCLUtil::CreateTransformOpDef(
    const std::string &input_name,
    const std::vector<mace::index_t> &input_shape,
    const std::string &output_name,
    const mace::DataType dt,
    const mace::MemoryType mem_type) {
  auto op = std::make_shared<OperatorDef>();
  op->set_name("mace_node_" + output_name);
  op->set_type("BufferTransform");
  op->add_input(input_name);
  op->add_output(output_name);
  // Small helper to append one integer-valued argument.
  auto add_int_arg = [&op](const std::string &arg_name, const int32_t value) {
    Argument *arg = op->add_arg();
    arg->set_name(arg_name);
    arg->set_i(value);
  };
  add_int_arg("buffer_type",
              static_cast<int32_t>(OpenCLBufferType::IN_OUT_CHANNEL));
  add_int_arg("mem_type", static_cast<int32_t>(mem_type));
  add_int_arg("T", static_cast<int32_t>(dt));
  add_int_arg("device", static_cast<int32_t>(DeviceType::GPU));
  if (!input_shape.empty()) {
    OutputShape *shape = op->add_output_shape();
    for (auto dim : input_shape) {
      shape->add_dims(dim);
    }
  }
  return op;
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
#include <memory>
#include <string>
#include <vector>
#include "mace/core/types.h"
namespace mace {
// How a tensor is laid out when stored as an OpenCL image; each value
// selects a shape-calculation strategy in OpenCLUtil::CalImage2DShape.
// Values are used as serialized op arguments ("buffer_type"), so they
// must stay stable.
enum OpenCLBufferType {
  CONV2D_FILTER = 0,       // conv filter, OIHW layout
  IN_OUT_CHANNEL = 1,      // activation, packed along channels
  ARGUMENT = 2,            // 1-D argument tensor (e.g. bias)
  IN_OUT_HEIGHT = 3,       // activation, packed along height
  IN_OUT_WIDTH = 4,        // activation, packed along width
  WINOGRAD_FILTER = 5,     // winograd-transformed conv filter
  DW_CONV2D_FILTER = 6,    // depthwise conv filter, MIHW layout
  WEIGHT_HEIGHT = 7,       // weight, packed along output channels
  WEIGHT_WIDTH = 8,        // weight, packed along input channels
};
// Static helpers for mapping tensors onto OpenCL 2D images and for
// creating buffer-transform operators.
class OpenCLUtil {
 public:
  // Computes the OpenCL image shape ([width, height]) that stores a
  // tensor of `shape` under layout `type`. `wino_blk_size` is only used
  // for WINOGRAD_FILTER (2 -> 4x4 tiles, i.e. the "16 * Oc" layout).
  static void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                              const OpenCLBufferType type,
                              std::vector<size_t> *image_shape,
                              const int wino_blk_size = 2);

  // Builds a "BufferTransform" OperatorDef converting `input_name` into
  // `output_name` with data type `dt` on memory `mem_type`; `input_shape`
  // (when non-empty) is recorded as the output shape hint.
  static std::shared_ptr<OperatorDef> CreateTransformOpDef(
      const std::string &input_name,
      const std::vector<mace::index_t> &input_shape,
      const std::string &output_name,
      const mace::DataType dt,
      const MemoryType mem_type);
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/opencl/scratch_image.h"
#include <utility>
#include <vector>
namespace mace {
// Defaulted out of line: members (map of unique_ptr<Image>, vector) need
// no setup or teardown beyond their own destructors.
ScratchImageManager::ScratchImageManager() = default;
ScratchImageManager::~ScratchImageManager() = default;
// Hands out a scratch Image of at least `shape` with data type `dt`,
// reusing an existing unused image when possible, otherwise allocating a
// new one through `allocator`. On success, increments the image's
// reference count and stores its index in `*id` (to be released later via
// Deactive(id)). Returns nullptr if a fresh allocation fails; `*id` is
// left untouched in that case.
Image *ScratchImageManager::Spawn(
    Allocator *allocator,
    const std::vector<size_t> &shape,
    const DataType dt,
    int *id) {
  // TODO(liuqi): not optimal memory reuse strategy
  int found_image_idx = -1;
  int image_count = static_cast<int>(reference_count_.size());
  for (int i = 0; i < image_count; ++i) {
    // Reusable iff currently unreferenced, same data type, and large
    // enough in both image dimensions.
    // Bug fix: index images_ by the image index `i`, not by the reference
    // count value (which was always 0 on this path, so only image 0 was
    // ever inspected).
    if (reference_count_[i] == 0 && images_.at(i)->dtype() == dt) {
      auto image_shape = images_.at(i)->image_shape();
      if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) {
        found_image_idx = i;
        break;
      }
    }
  }
  // No reusable image found: allocate and register a new one.
  if (found_image_idx == -1) {
    reference_count_.push_back(0);
    images_[image_count] = std::unique_ptr<Image>(new Image(allocator));
    if (images_.at(image_count)->Allocate(shape, dt) !=
        MaceStatus::MACE_SUCCESS) {
      // Roll back the registration so later Spawn calls cannot match a
      // dead (unallocated) image.
      images_.erase(image_count);
      reference_count_.pop_back();
      return nullptr;
    }
    found_image_idx = image_count;
    VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape)
            << "<" << dt << ">";
  }
  reference_count_[found_image_idx] += 1;
  *id = found_image_idx;
  return images_.at(found_image_idx).get();
}
// Releases one reference to the image at `id`, making it eligible for
// reuse by a later Spawn once its count reaches zero. Aborts if `id` is
// out of range or the image is not currently referenced.
void ScratchImageManager::Deactive(int id) {
  const bool valid = static_cast<size_t>(id) < reference_count_.size()
      && reference_count_[id] > 0;
  MACE_CHECK(valid,
             "Image id ", id, " exceed the vector size ",
             reference_count_.size());
  reference_count_[id] -= 1;
}
// RAII handle over a manager-owned scratch image; starts without one
// (id_ == -1) until Scratch() is called.
ScratchImage::ScratchImage(ScratchImageManager *manager)
    : manager_(manager), id_(-1) {}
// Returns the borrowed image (if any) to the manager on destruction.
ScratchImage::~ScratchImage() {
  if (id_ < 0) {
    return;  // Scratch() was never called (or never succeeded)
  }
  manager_->Deactive(id_);
}
// Borrows an image of at least `shape`/`dt` from the manager, recording
// its id so the destructor can release it.
Image *ScratchImage::Scratch(Allocator *allocator,
                             const std::vector<size_t> &shape,
                             const DataType dt) {
  return manager_->Spawn(allocator, shape, dt, &id_);
}
} // namespace mace
...@@ -12,39 +12,47 @@ ...@@ -12,39 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ #ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ #define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#include <memory>
#include <unordered_map>
#include <vector> #include <vector>
#include "mace/public/mace.h" #include "mace/core/buffer.h"
#include "mace/utils/utils.h"
namespace mace { namespace mace {
class OpContext; class ScratchImageManager {
class Tensor; public:
ScratchImageManager();
~ScratchImageManager();
Image *Spawn(Allocator *allocator,
const std::vector<size_t> &shape,
const DataType dt,
int *id);
namespace ops { void Deactive(int id);
class OpenCLWinogradTransformKernel { private:
public: std::unordered_map<int, std::unique_ptr<Image>> images_;
virtual MaceStatus Compute( std::vector<int> reference_count_;
OpContext *context,
const Tensor *input,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel);
}; };
class OpenCLWinogradInverseTransformKernel { class ScratchImage {
public: public:
virtual MaceStatus Compute( explicit ScratchImage(ScratchImageManager *);
OpContext *context, ~ScratchImage();
const std::vector<const Tensor*> &inputs,
Tensor *output) = 0; Image *Scratch(Allocator *allocator,
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); const std::vector<size_t> &shape,
const DataType dt);
private:
ScratchImageManager *manager_;
int id_;
}; };
} // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_
#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_
...@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { ...@@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
} }
} // namespace numerical_chars } // namespace numerical_chars
enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 }; enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 };
class Tensor { class Tensor {
public: public:
...@@ -222,6 +222,25 @@ class Tensor { ...@@ -222,6 +222,25 @@ class Tensor {
return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image(); return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
} }
inline MemoryType memory_type() const {
MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty");
if (buffer_->OnHost()) {
return MemoryType::CPU_BUFFER;
} else if (typeid(*buffer_) == typeid(Image)) {
return MemoryType::GPU_IMAGE;
} else {
return MemoryType::GPU_BUFFER;
}
}
inline void set_data_format(DataFormat data_format) {
data_format_ = data_format;
}
inline DataFormat data_format() const {
return data_format_;
}
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
inline cl::Image *opencl_image() const { inline cl::Image *opencl_image() const {
MACE_CHECK(has_opencl_image(), name_, " do not have image"); MACE_CHECK(has_opencl_image(), name_, " do not have image");
...@@ -488,6 +507,7 @@ class Tensor { ...@@ -488,6 +507,7 @@ class Tensor {
int32_t zero_point_; int32_t zero_point_;
float minval_; float minval_;
float maxval_; float maxval_;
DataFormat data_format_; // used for 4D input/output tensor
MACE_DISABLE_COPY_AND_ASSIGN(Tensor); MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
}; };
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <utility> #include <utility>
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/memory_optimizer.h"
#include "mace/utils/quantize.h" #include "mace/utils/quantize.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -27,13 +28,6 @@ ...@@ -27,13 +28,6 @@
namespace mace { namespace mace {
namespace { namespace {
bool ShouldPreallocateMemoryForOp(const OperatorDef &op) {
static const std::unordered_set<std::string> reuse_buffer_ops {
"Reshape", "Identity", "Squeeze"
};
return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end();
}
bool HasQuantizeOp(const NetDef &net_def) { bool HasQuantizeOp(const NetDef &net_def) {
for (auto &op : net_def.op()) { for (auto &op : net_def.op()) {
if (op.type() == "Quantize") { if (op.type() == "Quantize") {
...@@ -48,13 +42,14 @@ Workspace::Workspace() = default; ...@@ -48,13 +42,14 @@ Workspace::Workspace() = default;
Tensor *Workspace::CreateTensor(const std::string &name, Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
DataType type) { DataType type,
bool is_weight) {
if (HasTensor(name)) { if (HasTensor(name)) {
VLOG(3) << "Tensor " << name << " already exists. Skipping."; VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else { } else {
VLOG(3) << "Creating Tensor " << name; VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type, tensor_map_[name] = std::unique_ptr<Tensor>(new Tensor(alloc, type,
false, name)); is_weight, name));
} }
return GetTensor(name); return GetTensor(name);
} }
...@@ -199,66 +194,18 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, ...@@ -199,66 +194,18 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
fused_buffer_ = true; fused_buffer_ = true;
} }
} }
if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) {
MaceStatus status = CreateOutputTensorBuffer(net_def, device);
if (status != MaceStatus::MACE_SUCCESS) return status;
}
if (device_type == DeviceType::CPU) {
for (const auto &op : net_def.op()) {
VLOG(2) << "Add quantize info for op: " << op.name();
MACE_CHECK(op.quantize_info().empty()
|| op.quantize_info().size() == op.output().size(),
"quantize info size must be equal to output size or empty");
for (int i = 0; i < op.quantize_info().size(); ++i) {
auto &quantize_info = op.quantize_info(i);
Tensor *tensor = GetTensor(op.output(i));
tensor->SetScale(quantize_info.scale());
tensor->SetZeroPoint(quantize_info.zero_point());
tensor->SetMinVal(quantize_info.minval());
tensor->SetMaxVal(quantize_info.maxval());
}
}
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus Workspace::PreallocateOutputTensor(
const mace::NetDef &net_def,
const mace::MemoryOptimizer *mem_optimizer,
Device *device) { Device *device) {
DeviceType device_type = device->device_type(); auto &mem_blocks = mem_optimizer->mem_blocks();
DataType dtype = DataType::DT_INVALID; for (auto &mem_block : mem_blocks) {
if (net_def.mem_arena().mem_block_size() > 0) {
// We use the data type of the first op with mem id,
// as CPU&GPU have consistent data type for each layer for now.
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type && !op.mem_id().empty()) {
const DataType op_dtype = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
if (op_dtype != DataType::DT_INVALID) {
dtype = op_dtype;
// find first valid data type, break
break;
}
}
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
}
// TODO(liyin): memory block should not have concept of type, but to be
// consistent with gpu, all memory block use float/half as unit
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (mem_block.device_type() == device_type) {
VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id() VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id()
<< ", device type: " << mem_block.device_type() << ", memory type: " << mem_block.mem_type()
<< ", memory type: " << mem_block.mem_type(); << ", size: " << mem_block.x() << "x" << mem_block.y();
if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { if (mem_block.mem_type() == MemoryType::CPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf( std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetCPUAllocator())); new Buffer(GetCPUAllocator()));
...@@ -270,91 +217,72 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, ...@@ -270,91 +217,72 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
std::unique_ptr<BufferBase> image_buf( std::unique_ptr<BufferBase> image_buf(
new Image(device->allocator())); new Image(device->allocator()));
MACE_RETURN_IF_ERROR(image_buf->Allocate( MACE_RETURN_IF_ERROR(image_buf->Allocate(
{mem_block.x(), mem_block.y()}, dtype)); {static_cast<size_t>(mem_block.x()),
static_cast<size_t>(mem_block.y())}, mem_block.data_type()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf)); std::move(image_buf));
} else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) {
std::unique_ptr<BufferBase> tensor_buf( std::unique_ptr<BufferBase> tensor_buf(
new Buffer(device->allocator())); new Buffer(device->allocator()));
MACE_RETURN_IF_ERROR(tensor_buf->Allocate( MACE_RETURN_IF_ERROR(tensor_buf->Allocate(
mem_block.x() * GetEnumTypeSize(dtype) mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE));
+ MACE_EXTRA_BUFFER_PAD_SIZE));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf)); std::move(tensor_buf));
} }
} }
} VLOG(1) << "Preallocate buffer to tensors";
VLOG(3) << "Preallocate buffer to tensors"; bool is_quantize_model = IsQuantizedModel(net_def);
for (auto &op : net_def.op()) { for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) {
// TODO(liuqi): refactor to add device_type to OperatorDef
const int op_device =
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "device", static_cast<int>(device_type));
if (op_device == device_type) {
if (!op.mem_id().empty()
&& ShouldPreallocateMemoryForOp(op)) {
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = dtype;
}
std::unique_ptr<Tensor> tensor std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first),
output_type, false, op.output(i))); tensor_mem.second.second,
if (device_type == DeviceType::GPU && tensor->has_opencl_image()) { false, tensor_mem.first));
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" if (mem_blocks[tensor_mem.second.first].mem_type()
<< " Mem: " << mem_ids[i] == MemoryType::GPU_IMAGE) {
VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< " Image shape: " << " Image shape: "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer()) << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[0] ->image_shape()[0]
<< ", " << ", "
<< dynamic_cast<Image *>(tensor->UnderlyingBuffer()) << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
->image_shape()[1]; ->image_shape()[1];
tensor->set_data_format(DataFormat::NHWC);
} else { } else {
VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" VLOG(1) << "Tensor: " << tensor_mem.first
<< " Mem: " << mem_ids[i] << " Mem: " << tensor_mem.second.first
<< " Data type: " << tensor->dtype()
<< ", Buffer size: " << tensor->UnderlyingBuffer()->size(); << ", Buffer size: " << tensor->UnderlyingBuffer()->size();
} if (mem_blocks[tensor_mem.second.first].mem_type()
tensor_map_[op.output(i)] = std::move(tensor); == MemoryType::GPU_BUFFER ||
} is_quantize_model) {
tensor->set_data_format(DataFormat::NHWC);
} else { } else {
for (int i = 0; i < op.output().size(); ++i) { tensor->set_data_format(DataFormat::NCHW);
MACE_CHECK(
op.output_type_size() == 0
|| op.output_size()
== op.output_type_size(),
"operator output size != operator output type size",
op.output_size(),
op.output_type_size());
DataType output_type;
if (i < op.output_type_size()) {
output_type = op.output_type(i);
} else {
output_type = static_cast<DataType>(ProtoArgHelper::GetOptionalArg(
op, "T", static_cast<int>(DT_FLOAT)));
} }
CreateTensor(op.output(i),
device->allocator(),
output_type);
} }
tensor_map_[tensor_mem.first] = std::move(tensor);
} }
for (int output_idx = 0; output_idx < op.output_shape_size(); // add quantize info for output tensors.
++output_idx) { if (device->device_type() == DeviceType::CPU) {
std::vector<index_t> for (const auto &op : net_def.op()) {
shape_configured(op.output_shape(output_idx).dims_size()); VLOG(2) << "Add quantize info for op: " << op.name();
for (size_t dim = 0; dim < shape_configured.size(); ++dim) { MACE_CHECK(op.quantize_info().empty()
shape_configured[dim] = op.output_shape(output_idx).dims(dim); || op.quantize_info().size() == op.output().size(),
} "quantize info size must be equal to output size or empty");
tensor_map_[op.output(output_idx)]->SetShapeConfigured( for (int i = 0; i < op.quantize_info().size(); ++i) {
shape_configured); auto &quantize_info = op.quantize_info(i);
Tensor *tensor = GetTensor(op.output(i));
tensor->SetScale(quantize_info.scale());
tensor->SetZeroPoint(quantize_info.zero_point());
tensor->SetMinVal(quantize_info.minval());
tensor->SetMaxVal(quantize_info.maxval());
} }
} }
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
...@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, ...@@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
tensor_buffer_.reset(nullptr); tensor_buffer_.reset(nullptr);
} }
void Workspace::RemoveTensor(const std::string &name) {
auto iter = tensor_map_.find(name);
if (iter != tensor_map_.end()) {
tensor_map_.erase(iter);
}
}
} // namespace mace } // namespace mace
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
namespace mace { namespace mace {
class MemoryOptimizer;
class Workspace { class Workspace {
public: public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap; typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
...@@ -36,7 +38,8 @@ class Workspace { ...@@ -36,7 +38,8 @@ class Workspace {
Tensor *CreateTensor(const std::string &name, Tensor *CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
DataType type); DataType type,
bool is_weight = false);
inline bool HasTensor(const std::string &name) const { inline bool HasTensor(const std::string &name) const {
return tensor_map_.find(name) != tensor_map_.end(); return tensor_map_.find(name) != tensor_map_.end();
...@@ -52,12 +55,19 @@ class Workspace { ...@@ -52,12 +55,19 @@ class Workspace {
Device *device, Device *device,
const unsigned char *model_data); const unsigned char *model_data);
MaceStatus PreallocateOutputTensor(const NetDef &net_def,
const MemoryOptimizer *mem_optimizer,
Device *device);
void RemoveUnusedBuffer(); void RemoveUnusedBuffer();
void RemoveAndReloadBuffer(const NetDef &net_def, void RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data, const unsigned char *model_data,
Allocator *alloc); Allocator *alloc);
void RemoveTensor(const std::string &name);
private: private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
Device *device); Device *device);
......
...@@ -20,9 +20,11 @@ ...@@ -20,9 +20,11 @@
#include <memory> #include <memory>
#include "mace/core/net.h"
#include "mace/core/device_context.h" #include "mace/core/device_context.h"
#include "mace/core/memory_optimizer.h"
#include "mace/core/net.h"
#include "mace/ops/ops_registry.h" #include "mace/ops/ops_registry.h"
#include "mace/ops/transpose.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
...@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
// Check OpenCL avaliable // Check OpenCL avaliable
auto runtime = device->opencl_runtime(); auto runtime = device->opencl_runtime();
if (!runtime->is_opencl_avaliable()) { if (!runtime->is_opencl_avaliable()) {
LOG(WARNING) << "The device does not support OpenCL";
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
} }
...@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { ...@@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) {
const MemoryType mem_type = static_cast<MemoryType>(mem_type_i); const MemoryType mem_type = static_cast<MemoryType>(mem_type_i);
runtime->set_mem_type(mem_type); runtime->set_mem_type(mem_type);
if (mem_type == MemoryType::GPU_IMAGE) {
if (!runtime->IsImageSupport()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
auto opencl_max_image_size = runtime->GetMaxImage2DSize();
if (opencl_max_image_size.empty()) {
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
const std::vector<int64_t> net_max_image_size =
ProtoArgHelper::GetRepeatedArgs<NetDef, int64_t>(
*net_def, "opencl_max_image_size", {0, 0});
if (static_cast<uint64_t>(net_max_image_size[0]) > opencl_max_image_size[0]
|| static_cast<uint64_t>(net_max_image_size[1])
> opencl_max_image_size[1]) {
LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size)
<< " vs " << MakeString(net_max_image_size);
return MaceStatus::MACE_OUT_OF_RESOURCES;
}
}
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
...@@ -288,14 +269,17 @@ class MaceTensor::Impl { ...@@ -288,14 +269,17 @@ class MaceTensor::Impl {
public: public:
std::vector<int64_t> shape; std::vector<int64_t> shape;
std::shared_ptr<float> data; std::shared_ptr<float> data;
DataFormat format;
}; };
MaceTensor::MaceTensor(const std::vector<int64_t> &shape, MaceTensor::MaceTensor(const std::vector<int64_t> &shape,
std::shared_ptr<float> data) { std::shared_ptr<float> data,
const DataFormat format) {
MACE_CHECK_NOTNULL(data.get()); MACE_CHECK_NOTNULL(data.get());
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl()); impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = shape; impl_->shape = shape;
impl_->data = data; impl_->data = data;
impl_->format = format;
} }
MaceTensor::MaceTensor() { MaceTensor::MaceTensor() {
...@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) { ...@@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl()); impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
} }
MaceTensor::MaceTensor(const MaceTensor &&other) { MaceTensor::MaceTensor(const MaceTensor &&other) {
impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl()); impl_ = std::unique_ptr<MaceTensor::Impl>(new MaceTensor::Impl());
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
} }
MaceTensor &MaceTensor::operator=(const MaceTensor &other) { MaceTensor &MaceTensor::operator=(const MaceTensor &other) {
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
return *this; return *this;
} }
MaceTensor &MaceTensor::operator=(const MaceTensor &&other) { MaceTensor &MaceTensor::operator=(const MaceTensor &&other) {
impl_->shape = other.shape(); impl_->shape = other.shape();
impl_->data = other.data(); impl_->data = other.data();
impl_->format = other.data_format();
return *this; return *this;
} }
...@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; } ...@@ -334,6 +322,10 @@ const std::shared_ptr<float> MaceTensor::data() const { return impl_->data; }
std::shared_ptr<float> MaceTensor::data() { return impl_->data; } std::shared_ptr<float> MaceTensor::data() { return impl_->data; }
DataFormat MaceTensor::data_format() const {
return impl_->format;
}
// Mace Engine // Mace Engine
class MaceEngine::Impl { class MaceEngine::Impl {
public: public:
...@@ -355,6 +347,14 @@ class MaceEngine::Impl { ...@@ -355,6 +347,14 @@ class MaceEngine::Impl {
std::map<std::string, MaceTensor> *outputs, std::map<std::string, MaceTensor> *outputs,
RunMetadata *run_metadata); RunMetadata *run_metadata);
private:
MaceStatus TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor);
MaceStatus TransposeOutput(const Tensor *output_tensor,
std::pair<const std::string, MaceTensor> *output);
private: private:
const unsigned char *model_data_; const unsigned char *model_data_;
size_t model_data_size_; size_t model_data_size_;
...@@ -363,11 +363,12 @@ class MaceEngine::Impl { ...@@ -363,11 +363,12 @@ class MaceEngine::Impl {
std::unique_ptr<Device> device_; std::unique_ptr<Device> device_;
std::unique_ptr<Workspace> ws_; std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
std::map<std::string, mace::InputInfo> input_info_map_; bool is_quantized_model_;
std::map<std::string, mace::OutputInfo> output_info_map_;
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
std::unique_ptr<HexagonControlWrapper> hexagon_controller_; std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif #endif
std::map<std::string, mace::InputInfo> input_info_map_;
std::map<std::string, mace::OutputInfo> output_info_map_;
MACE_DISABLE_COPY_AND_ASSIGN(Impl); MACE_DISABLE_COPY_AND_ASSIGN(Impl);
}; };
...@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) ...@@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
device_type_(config.impl_->device_type()), device_type_(config.impl_->device_type()),
device_(nullptr), device_(nullptr),
ws_(new Workspace()), ws_(new Workspace()),
net_(nullptr) net_(nullptr),
is_quantized_model_(false)
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
, hexagon_controller_(nullptr) , hexagon_controller_(nullptr)
#endif #endif
...@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get())); MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get()));
} }
#endif #endif
// mark quantized model flag
is_quantized_model_ = IsQuantizedModel(*net_def);
// Get input and output information. // Get input and output information.
for (auto &input_info : net_def->input_info()) { for (auto &input_info : net_def->input_info()) {
input_info_map_[input_info.name()] = input_info; input_info_map_[input_info.name()] = input_info;
...@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's inputs: " << "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_)); << MakeString(MapKeys(input_info_map_));
} }
ws_->CreateTensor(MakeString("mace_input_node_", input_name), ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT);
device_->allocator(), DT_FLOAT);
} }
for (auto output_name : output_nodes) { for (auto output_name : output_nodes) {
if (output_info_map_.find(output_name) == output_info_map_.end()) { if (output_info_map_.find(output_name) == output_info_map_.end()) {
...@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init(
<< "' does not belong to model's outputs " << "' does not belong to model's outputs "
<< MakeString(MapKeys(output_info_map_)); << MakeString(MapKeys(output_info_map_));
} }
ws_->CreateTensor(MakeString("mace_output_node_", output_name),
device_->allocator(), DT_FLOAT);
} }
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
if (device_type_ == HEXAGON) { if (device_type_ == HEXAGON) {
...@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init( ...@@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init(
device_.get(), device_.get(),
model_data)); model_data));
MemoryOptimizer mem_optimizer;
// Init model // Init model
auto net = std::unique_ptr<NetBase>(new SerialNet(
op_registry_.get(),
net_def,
ws_.get(),
device_.get(),
NetMode::INIT));
MACE_RETURN_IF_ERROR(net->Init());
MACE_RETURN_IF_ERROR(net->Run());
net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(), net_ = std::unique_ptr<NetBase>(new SerialNet(op_registry_.get(),
net_def, net_def,
ws_.get(), ws_.get(),
device_.get(),
&mem_optimizer));
// Preallocate all output tensors of ops
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get())); device_.get()));
MACE_RETURN_IF_ERROR(net_->Init()); MACE_RETURN_IF_ERROR(net_->Init());
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
} }
...@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() { ...@@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() {
#endif #endif
} }
MaceStatus MaceEngine::Impl::TransposeInput(
const std::pair<const std::string, MaceTensor> &input,
Tensor *input_tensor) {
if (device_->device_type() == DeviceType::CPU &&
input.second.shape().size() == 4 &&
input.second.data_format() == NHWC &&
!is_quantized_model_) {
VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW";
input_tensor->set_data_format(DataFormat::NCHW);
std::vector<int> dst_dims = {0, 3, 1, 2};
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else if (
(is_quantized_model_ || device_->device_type() == DeviceType::GPU) &&
input.second.shape().size() == 4 &&
input.second.data_format() == DataFormat::NCHW) {
VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
input_tensor->set_data_format(DataFormat::NHWC);
std::vector<index_t> output_shape =
TransposeShape<int64_t, index_t>(input.second.shape(), dst_dims);
MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
return ops::Transpose(input.second.data().get(),
input.second.shape(),
dst_dims,
input_data);
} else {
input_tensor->set_data_format(input.second.data_format());
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
}
MaceStatus MaceEngine::Impl::TransposeOutput(
const mace::Tensor *output_tensor,
std::pair<const std::string, mace::MaceTensor> *output) {
// save output
if (output_tensor != nullptr && output->second.data() != nullptr) {
if (device_->device_type() == DeviceType::CPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
MACE_CHECK(output_tensor->data_format() == NCHW);
VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC";
std::vector<int> dst_dims = {0, 2, 3, 1};
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else if (device_->device_type() == DeviceType::GPU &&
output->second.shape().size() == 4 &&
output->second.data_format() != output_tensor->data_format()) {
VLOG(1) << "Transform output " << output->first << " from "
<< output_tensor->data_format() << " to "
<< output->second.data_format();
std::vector<int> dst_dims = {0, 3, 1, 2};
if (output_tensor->data_format() == NCHW) {
dst_dims = {0, 2, 3, 1};
}
std::vector<index_t> shape =
TransposeShape<index_t, index_t>(output_tensor->shape(),
dst_dims);
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
Tensor::MappingGuard output_guard(output_tensor);
const float *output_data = output_tensor->data<float>();
return ops::Transpose(output_data,
output_tensor->shape(),
dst_dims,
output->second.data().get());
} else {
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output->second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(shape) << " != "
<< MakeString<int64_t>(output->second.shape());
std::memcpy(output->second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
return MaceStatus::MACE_SUCCESS;
}
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
}
MaceStatus MaceEngine::Impl::Run( MaceStatus MaceEngine::Impl::Run(
const std::map<std::string, MaceTensor> &inputs, const std::map<std::string, MaceTensor> &inputs,
std::map<std::string, MaceTensor> *outputs, std::map<std::string, MaceTensor> *outputs,
...@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's inputs: " << "' does not belong to model's inputs: "
<< MakeString(MapKeys(input_info_map_)); << MakeString(MapKeys(input_info_map_));
} }
Tensor *input_tensor = Tensor *input_tensor = ws_->GetTensor(input.first);
ws_->GetTensor(MakeString("mace_input_node_", input.first)); MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor));
MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape()));
{
Tensor::MappingGuard input_guard(input_tensor);
float *input_data = input_tensor->mutable_data<float>();
memcpy(input_data, input.second.data().get(),
input_tensor->size() * sizeof(float));
}
input_tensors.push_back(input_tensor); input_tensors.push_back(input_tensor);
} }
for (auto &output : *outputs) { for (auto &output : *outputs) {
...@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run(
<< "' does not belong to model's outputs: " << "' does not belong to model's outputs: "
<< MakeString(MapKeys(output_info_map_)); << MakeString(MapKeys(output_info_map_));
} }
Tensor *output_tensor = Tensor *output_tensor = ws_->GetTensor(output.first);
ws_->GetTensor(MakeString("mace_output_node_", output.first));
output_tensors.push_back(output_tensor); output_tensors.push_back(output_tensor);
} }
#ifdef MACE_ENABLE_HEXAGON #ifdef MACE_ENABLE_HEXAGON
...@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run( ...@@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run(
} }
#endif #endif
for (auto &output : *outputs) { for (auto &output : *outputs) {
Tensor *output_tensor = Tensor *output_tensor = ws_->GetTensor(output.first);
ws_->GetTensor(MakeString("mace_output_node_", output.first));
// save output // save output
if (output_tensor != nullptr && output.second.data() != nullptr) { MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output));
Tensor::MappingGuard output_guard(output_tensor);
auto shape = output_tensor->shape();
int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
MACE_CHECK(shape == output.second.shape())
<< "Output shape mismatch: "
<< MakeString<int64_t>(output.second.shape())
<< " != " << MakeString<int64_t>(shape);
std::memcpy(output.second.data().get(), output_tensor->data<float>(),
output_size * sizeof(float));
} else {
return MaceStatus::MACE_INVALID_ARGS;
}
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
......
...@@ -14,7 +14,6 @@ mace { ...@@ -14,7 +14,6 @@ mace {
*mace*NetDef*; *mace*NetDef*;
*mace*MemoryType*; *mace*MemoryType*;
*mace*DataType*; *mace*DataType*;
*mace*MemoryArena*;
*mace*InputInfo*; *mace*InputInfo*;
*mace*OutputInfo*; *mace*OutputInfo*;
*mace*OutputShape*; *mace*OutputShape*;
......
...@@ -30,10 +30,8 @@ cc_library( ...@@ -30,10 +30,8 @@ cc_library(
"arm/*_test.cc", "arm/*_test.cc",
"ops_registry.cc", "ops_registry.cc",
"ops_test_util.cc", "ops_test_util.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc", "buffer_transform.cc",
"lstm_cell.cc", "lstm_cell.cc",
"winograd_transform.cc",
"quantize.cc", "quantize.cc",
], ],
) + if_opencl_enabled(glob( ) + if_opencl_enabled(glob(
...@@ -41,10 +39,8 @@ cc_library( ...@@ -41,10 +39,8 @@ cc_library(
"opencl/*.cc", "opencl/*.cc",
"opencl/image/*.cc", "opencl/image/*.cc",
"opencl/buffer/*.cc", "opencl/buffer/*.cc",
"buffer_inverse_transform.cc",
"buffer_transform.cc", "buffer_transform.cc",
"lstm_cell.cc", "lstm_cell.cc",
"winograd_transform.cc",
], ],
exclude = [ exclude = [
"opencl/*_test.cc", "opencl/*_test.cc",
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/activation.h" #include "mace/ops/opencl/image/activation.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation { ...@@ -79,12 +80,19 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
"NOOP")); "NOOP"));
auto relux_max_limit = static_cast<T>( auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f)); Operation::GetOptionalArg<float>("max_limit", 0.0f));
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset( kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit)); new opencl::image::ActivationKernel<T>(type, relux_max_limit));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (type == ActivationType::PRELU) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "ReluBM") OpDefBuilder("Activation", "ReluBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddStringArg("activation", "RELU") .AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else { } else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "ReluxBM") OpDefBuilder("Activation", "ReluxBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddStringArg("activation", "RELUX") .AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0) .AddFloatArg("max_limit", 6.0)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
net.AddRandomInput<D, float>("Alpha", {channels}); net.AddRandomInput<D, T>("Alpha", {channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "PreluBM") OpDefBuilder("Activation", "PreluBM")
.Input("Input") .Input("Input")
.Input("Alpha") .Input("Alpha")
.Output("Output") .Output("Output")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluBM")
.Input("InputImage")
.Input("AlphaImage")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { ...@@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else { } else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "TanhBM") OpDefBuilder("Activation", "TanhBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddStringArg("activation", "TANH") .AddStringArg("activation", "TANH")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
...@@ -310,27 +262,17 @@ void SigmoidBenchmark( ...@@ -310,27 +262,17 @@ void SigmoidBenchmark(
// Add input data // Add input data
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else { } else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidBM")
.Input("InputImage")
.Output("Output")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "SigmoidBM") OpDefBuilder("Activation", "SigmoidBM")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddStringArg("activation", "SIGMOID") .AddStringArg("activation", "SIGMOID")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -30,23 +30,6 @@ void TestSimpleRelu() { ...@@ -30,23 +30,6 @@ void TestSimpleRelu() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest") OpDefBuilder("Activation", "ReluTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
...@@ -55,7 +38,6 @@ void TestSimpleRelu() { ...@@ -55,7 +38,6 @@ void TestSimpleRelu() {
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
...@@ -78,23 +60,6 @@ void TestUnalignedSimpleRelu() { ...@@ -78,23 +60,6 @@ void TestUnalignedSimpleRelu() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); net.AddInputFromArray<D, float>("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluTest") OpDefBuilder("Activation", "ReluTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
...@@ -103,7 +68,6 @@ void TestUnalignedSimpleRelu() { ...@@ -103,7 +68,6 @@ void TestUnalignedSimpleRelu() {
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); auto expected = net.CreateTensor<float>({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5});
...@@ -129,24 +93,6 @@ void TestSimpleRelux() { ...@@ -129,24 +93,6 @@ void TestSimpleRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest") OpDefBuilder("Activation", "ReluxTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
...@@ -156,7 +102,6 @@ void TestSimpleRelux() { ...@@ -156,7 +102,6 @@ void TestSimpleRelux() {
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
...@@ -179,24 +124,6 @@ void TestSimpleReluRelux() { ...@@ -179,24 +124,6 @@ void TestSimpleReluRelux() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "ReluxTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "ReluxTest") OpDefBuilder("Activation", "ReluxTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
...@@ -206,7 +133,6 @@ void TestSimpleReluRelux() { ...@@ -206,7 +133,6 @@ void TestSimpleReluRelux() {
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0});
...@@ -232,45 +158,36 @@ void TestSimplePrelu() { ...@@ -232,45 +158,36 @@ void TestSimplePrelu() {
net.AddInputFromArray<D, float>( net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}); net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0}, true);
if (D == DeviceType::GPU) { if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Activation", "PreluTest") OpDefBuilder("Activation", "PreluTest")
.Input("InputImage") .Input("Input")
.Input("AlphaImage") .Input("Alpha")
.Output("OutputImage") .Output("Output")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else { } else {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Activation", "PreluTest") OpDefBuilder("Activation", "PreluTest")
.Input("Input") .Input("InputNCHW")
.Input("Alpha") .Input("Alpha")
.Output("Output") .Output("OutputNCHW")
.AddStringArg("activation", "PRELU") .AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} }
if (D == DeviceType::CPU) {
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0}); {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5); ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} }
} // namespace } // namespace
...@@ -290,23 +207,6 @@ void TestSimpleTanh() { ...@@ -290,23 +207,6 @@ void TestSimpleTanh() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "TanhTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "TANH")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "TanhTest") OpDefBuilder("Activation", "TanhTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
...@@ -315,7 +215,6 @@ void TestSimpleTanh() { ...@@ -315,7 +215,6 @@ void TestSimpleTanh() {
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
...@@ -343,23 +242,6 @@ void TestSimpleSigmoid() { ...@@ -343,23 +242,6 @@ void TestSimpleSigmoid() {
"Input", {2, 2, 2, 2}, "Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Activation", "SigmoidTest")
.Input("InputImage")
.Output("OutputImage")
.AddStringArg("activation", "SIGMOID")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Activation", "SigmoidTest") OpDefBuilder("Activation", "SigmoidTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
...@@ -368,7 +250,6 @@ void TestSimpleSigmoid() { ...@@ -368,7 +250,6 @@ void TestSimpleSigmoid() {
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{2, 2, 2, 2}, {2, 2, 2, 2},
......
...@@ -32,20 +32,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -32,20 +32,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c}); net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
} }
if (D == DeviceType::GPU) {
for (int i = 0; i < inputs; ++i) {
BufferToImage<D, T>(&net, MakeString("Input", i).c_str(),
MakeString("InputImage", i).c_str(),
ops::BufferType::IN_OUT_CHANNEL);
}
OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("InputImage", i).c_str());
}
op_def_builder.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder op_def_builder("AddN", "AddNBM"); OpDefBuilder op_def_builder("AddN", "AddNBM");
for (int i = 0; i < inputs; ++i) { for (int i = 0; i < inputs; ++i) {
op_def_builder.Input(MakeString("Input", i).c_str()); op_def_builder.Input(MakeString("Input", i).c_str());
...@@ -53,7 +39,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { ...@@ -53,7 +39,6 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
op_def_builder.Output("Output") op_def_builder.Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -62,29 +62,6 @@ void SimpleAdd3() { ...@@ -62,29 +62,6 @@ void SimpleAdd3() {
net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1}, net.AddInputFromArray<D, float>("Input3", {1, 2, 3, 1},
{-0.1582, 2, 3, 4, 5, 6}); {-0.1582, 2, 3, 4, 5, 6});
const int input_num = 4;
if (D == DeviceType::GPU) {
// run on gpu
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("AddN", "AddNTest") OpDefBuilder("AddN", "AddNTest")
.Input("Input0") .Input("Input0")
.Input("Input1") .Input("Input1")
...@@ -94,7 +71,6 @@ void SimpleAdd3() { ...@@ -94,7 +71,6 @@ void SimpleAdd3() {
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
}
auto expected = auto expected =
net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); net.CreateTensor<float>({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24});
...@@ -138,28 +114,10 @@ void RandomTest() { ...@@ -138,28 +114,10 @@ void RandomTest() {
auto expected = net.CreateTensor<float>(); auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// run on gpu // run on device
for (int i = 0; i < input_num; ++i) {
BufferToImage<D, half>(&net, MakeString("Input", i),
MakeString("InputImage", i),
ops::BufferType::IN_OUT_CHANNEL);
}
auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
for (int i = 0; i < input_num; ++i) {
op_def_cl.Input(MakeString("InputImage", i));
}
op_def_cl.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D); net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2,
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-2,
1e-2); 1e-2);
} }
} }
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/batch_norm.h" #include "mace/ops/opencl/image/batch_norm.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -147,12 +148,27 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation { ...@@ -147,12 +148,27 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
ActivationType activation = ops::StringToActivationType( ActivationType activation = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP")); Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f); float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BatchNormKernel<T>( kernel_.reset(new opencl::image::BatchNormKernel<T>(
epsilon, activation, relux_max_limit)); epsilon, activation, relux_max_limit));
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
// Transform filters
int input_size = operator_def_->input_size();
for (int i = 1; i < input_size; ++i) {
const Tensor *input_tensor = context->workspace()->GetTensor(
operator_def_->input(i));
MACE_CHECK(input_tensor != nullptr);
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
i,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
bool not_folded = this->InputSize() == 5; bool not_folded = this->InputSize() == 5;
......
...@@ -36,12 +36,11 @@ void BatchNorm( ...@@ -36,12 +36,11 @@ void BatchNorm(
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
net.AddRandomInput<D, T>("Scale", {channels}); net.AddRandomInput<D, T>("Scale", {channels}, true);
net.AddRandomInput<D, T>("Offset", {channels}); net.AddRandomInput<D, T>("Offset", {channels}, true);
net.AddRandomInput<D, T>("Mean", {channels}); net.AddRandomInput<D, T>("Mean", {channels}, true);
net.AddRandomInput<D, T>("Var", {channels}, true); net.AddRandomInput<D, T>("Var", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BatchNorm", "BatchNormBM") OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input") .Input("Input")
.Input("Scale") .Input("Scale")
...@@ -50,30 +49,8 @@ void BatchNorm( ...@@ -50,30 +49,8 @@ void BatchNorm(
.Input("Var") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("Output") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// tuning // tuning
setenv("MACE_TUNING", "1", 1); setenv("MACE_TUNING", "1", 1);
......
...@@ -28,10 +28,10 @@ void Simple() { ...@@ -28,10 +28,10 @@ void Simple() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}); net.AddInputFromArray<D, float>("Scale", {1}, {4.0f}, true);
net.AddInputFromArray<D, float>("Offset", {1}, {2.0}); net.AddInputFromArray<D, float>("Offset", {1}, {2.0}, true);
net.AddInputFromArray<D, float>("Mean", {1}, {10}); net.AddInputFromArray<D, float>("Mean", {1}, {10}, true);
net.AddInputFromArray<D, float>("Var", {1}, {11.67f}); net.AddInputFromArray<D, float>("Var", {1}, {11.67f}, true);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW); net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
...@@ -49,32 +49,17 @@ void Simple() { ...@@ -49,32 +49,17 @@ void Simple() {
net.RunOp(D); net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC); net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<D, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} }
// Check // Check
...@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Tuning // Tuning
...@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { ...@@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput",
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4); 1e-5, 1e-4);
} }
...@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { ...@@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2); 1e-1, 1e-2);
} }
...@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, float>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-3) .AddFloatArg("epsilon", 1e-3)
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// tuning // tuning
...@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { ...@@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-5, 1e-4); 1e-5, 1e-4);
} }
...@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}); net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on opencl
BufferToImage<DeviceType::GPU, half>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, half>(&net, "Scale", "ScaleImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Offset", "OffsetImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Mean", "MeanImage",
ops::BufferType::ARGUMENT);
BufferToImage<DeviceType::GPU, half>(&net, "Var", "VarImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest") OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage") .Input("Input")
.Input("ScaleImage") .Input("Scale")
.Input("OffsetImage") .Input("Offset")
.Input("MeanImage") .Input("Mean")
.Input("VarImage") .Input("Var")
.AddFloatArg("epsilon", 1e-1) .AddFloatArg("epsilon", 1e-1)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF)) .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ...@@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync(); net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
1e-1, 1e-2); 1e-1, 1e-2);
} }
......
...@@ -32,23 +32,13 @@ void BMBatchToSpace( ...@@ -32,23 +32,13 @@ void BMBatchToSpace(
net.AddRandomInput<D, float>("Input", {batch, height, width, channels}); net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} }
if (D == DeviceType::CPU) {
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("Input") .Input("Input")
.Output("Output") .Output("Output")
.AddIntsArg("crops", {0, 0, 0, 0}) .AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg}) .AddIntsArg("block_shape", {arg, arg})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntsArg("crops", {0, 0, 0, 0})
.AddIntsArg("block_shape", {arg, arg})
.Finalize(net.NewOperatorDef());
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.RunOp(D); net.RunOp(D);
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/activation.h" #include "mace/ops/activation.h"
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/bias_add.h" #include "mace/ops/opencl/image/bias_add.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
...@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation { ...@@ -99,11 +100,16 @@ class BiasAddOp<DeviceType::GPU, T> : public Operation {
: Operation(context), : Operation(context),
data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>( data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
"data_format", NHWC))) { "data_format", NHWC))) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BiasAddKernel<T>); kernel_.reset(new opencl::image::BiasAddKernel<T>);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
......
...@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { ...@@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net; OpsTestNet net;
// Add input data // Add input data
DataFormat data_format = NHWC;
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
data_format = NCHW;
net.AddRandomInput<D, T>("Input", {batch, channels, height, width}); net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels}); net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
net.AddRandomInput<D, T>("Bias", {channels}, true); net.AddRandomInput<D, T>("Bias", {channels}, true, true);
if (D == DeviceType::CPU) {
OpDefBuilder("BiasAdd", "BiasAddBM") OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input") .Input("Input")
.Input("Bias") .Input("Bias")
.AddIntArg("data_format", NCHW) .AddIntArg("data_format", data_format)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("InputImage")
.Input("BiasImage")
.Output("Output") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -28,7 +28,7 @@ void BiasAddSimple() { ...@@ -28,7 +28,7 @@ void BiasAddSimple() {
// Add input data // Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1}, net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}); net.AddInputFromArray<D, float>("Bias", {1}, {0.5f}, true);
if (D == DeviceType::CPU) { if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
...@@ -44,22 +44,13 @@ void BiasAddSimple() { ...@@ -44,22 +44,13 @@ void BiasAddSimple() {
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
"Output", NHWC); "Output", NHWC);
} else if (D == DeviceType::GPU) { } else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("Input")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(D); net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
} else { } else {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
...@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { ...@@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
auto expected = net.CreateTensor<float>(); auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on gpu
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("Input")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
...@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input", net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels}); {batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true); net.AddRandomInput<DeviceType::GPU, float>("Bias", {channels}, true, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW); NCHW);
...@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { ...@@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
auto expected = net.CreateTensor<float>(); auto expected = net.CreateTensor<float>();
expected->Copy(*net.GetOutput("Output")); expected->Copy(*net.GetOutput("Output"));
// Run on opencl // Run on gpu
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, float>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("BiasAdd", "BiasAddTest") OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputImage") .Input("Input")
.Input("BiasImage") .Input("Bias")
.Output("OutputImage") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run on opencl // Run on opencl
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
net.Sync();
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "OPENCLOutput", ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
ops::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"), 1e-5);
} }
} // namespace test } // namespace test
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_inverse_transform.h"
#include "mace/ops/opencl/image/image_to_buffer.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BufferInverseTransformOp;

// GPU specialization: converts a transformed GPU-side tensor (image or
// transformed buffer) back into a plain OpenCL buffer layout.
template <typename T>
class BufferInverseTransformOp<DeviceType::GPU, T> : public Operation {
 public:
  explicit BufferInverseTransformOp(OpConstructContext *context)
      : Operation(context),
        wino_blk_size_(
            Operation::GetOptionalArg<int>("wino_block_size", 2)) {
    // Choose the concrete kernel up front: when the OpenCL runtime backs
    // tensors with image memory we need an image->buffer copy, otherwise a
    // pure buffer-to-buffer inverse transform suffices.
    if (context->device()->opencl_runtime()->UseImageMemory()) {
      kernel_ = std::unique_ptr<OpenCLBufferInverseTransformKernel>(
          new opencl::image::ImageToBuffer<T>);
    } else {
      kernel_ = std::unique_ptr<OpenCLBufferInverseTransformKernel>(
          new opencl::buffer::BufferInverseTransform<T>);
    }
  }

  MaceStatus Run(OpContext *context) override {
    const Tensor *src = this->Input(0);
    Tensor *dst = this->Output(0);
    // "buffer_type" describes the layout the source was transformed into;
    // it defaults to CONV2D_FILTER when the op definition omits it.
    const auto buffer_type =
        static_cast<ops::BufferType>(Operation::GetOptionalArg<int>(
            "buffer_type", static_cast<int>(ops::CONV2D_FILTER)));
    return kernel_->Compute(context, src, buffer_type,
                            wino_blk_size_, dst);
  }

 private:
  const int wino_blk_size_;  // Winograd block size forwarded to the kernel.
  std::unique_ptr<OpenCLBufferInverseTransformKernel> kernel_;
};
// Registers the "BufferInverseTransform" op with the registry for the GPU
// device, once for float and once for half precision.
void RegisterBufferInverseTransform(OpRegistryBase *op_registry) {
  MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
                   BufferInverseTransformOp, DeviceType::GPU, float);
  MACE_REGISTER_OP(op_registry, "BufferInverseTransform",
                   BufferInverseTransformOp, DeviceType::GPU, half);
}
} // namespace ops
} // namespace mace
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/testing/test_benchmark.h" #include "mace/core/testing/test_benchmark.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters, ...@@ -28,26 +29,36 @@ void FilterBufferToImage(int iters,
mace::testing::StopTiming(); mace::testing::StopTiming();
OpsTestNet net; OpsTestNet net;
OpContext context(net.ws(),
OpTestContext::Get()->GetDevice(DeviceType::GPU));
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", net.AddRandomInput<D, T>("Input",
{out_channel, in_channel, height, width}); {out_channel, in_channel, height, width});
// Create output
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpDefBuilder("BufferToImage", "BufferToImageBM") auto transform_func = [&]() {
.Input("Input") OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Output("Output") .Transform(&context,
.Finalize(net.NewOperatorDef()); net.ws()->GetTensor("Input"),
OpenCLBufferType::IN_OUT_CHANNEL,
MemoryType::GPU_IMAGE,
0,
b2i_output);
};
// Warm-up // Warm-up
net.Setup(D); net.Setup(D);
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
net.Run(); transform_func();
} }
net.Sync(); net.Sync();
mace::testing::StartTiming(); mace::testing::StartTiming();
while (iters--) { while (iters--) {
net.Run(); transform_func();
} }
net.Sync(); net.Sync();
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
#include "mace/ops/opencl/buffer_transformer.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -21,31 +22,27 @@ namespace test { ...@@ -21,31 +22,27 @@ namespace test {
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestBidirectionTransform(const int type, void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, T>("Input", input_shape); net.AddRandomInput<D, T>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
net.RunOp(D); .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") // Inverse Transform
.Input("B2IOutput") Tensor *i2b_output = net.ws()->CreateTensor(
.Output("I2BOutput") "I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
.AddIntArg("buffer_type", type) OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.AddIntArg("T", DataTypeToEnum<T>::value) .Transform(&context, b2i_output,
.Finalize(net.NewOperatorDef()); type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Run
net.RunOp(D);
// Check // Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
...@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type, ...@@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgSmall) { TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {1}); TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{1});
} }
TEST(BufferToImageTest, ArgHalfSmall) { TEST(BufferToImageTest, ArgHalfSmall) {
TestBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, half>(OpenCLBufferType::ARGUMENT,
{11});
} }
TEST(BufferToImageTest, ArgMedium) { TEST(BufferToImageTest, ArgMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {11}); TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{11});
} }
TEST(BufferToImageTest, ArgLarge) { TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::ARGUMENT, {256}); TestBidirectionTransform<DeviceType::GPU, float>(OpenCLBufferType::ARGUMENT,
{256});
} }
TEST(BufferToImageTest, InputSmallSingleChannel) { TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{1, 2, 3, 1}); OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1});
} }
TEST(BufferToImageTest, InputSmallMultipleChannel) { TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{1, 2, 3, 3}); OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3});
} }
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{3, 2, 3, 3}); OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3});
} }
TEST(BufferToImageTest, InputMedium) { TEST(BufferToImageTest, InputMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{3, 13, 17, 128}); OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128});
} }
TEST(BufferToImageTest, InputLarge) { TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::IN_OUT_CHANNEL, TestBidirectionTransform<DeviceType::GPU, float>(
{3, 64, 64, 256}); OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256});
} }
TEST(BufferToImageTest, Filter1x1Small) { TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{5, 3, 1, 1}); {5, 3, 1, 1});
} }
TEST(BufferToImageTest, Filter1x1Medium) { TEST(BufferToImageTest, Filter1x1Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{13, 17, 1, 1}); {13, 17, 1, 1});
} }
TEST(BufferToImageTest, Filter1x1Large) { TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{512, 128, 1, 1}); {512, 128, 1, 1});
} }
TEST(BufferToImageTest, Filter3x3Small) { TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{3, 5, 3, 3}); {3, 5, 3, 3});
} }
TEST(BufferToImageTest, Filter3x3Medium) { TEST(BufferToImageTest, Filter3x3Medium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{17, 13, 3, 3}); {17, 13, 3, 3});
} }
TEST(BufferToImageTest, Filter3x3Large) { TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::CONV2D_FILTER, TestBidirectionTransform<DeviceType::GPU, float>(CONV2D_FILTER,
{256, 128, 3, 3}); {256, 128, 3, 3});
} }
TEST(BufferToImageTest, WeightWidthSmall) { TEST(BufferToImageTest, WeightWidthSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH, TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{1, 3, 3, 3}); {1, 3, 3, 3});
} }
TEST(BufferToImageTest, WeightWidthMedium) { TEST(BufferToImageTest, WeightWidthMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH, TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{11, 13, 13, 17}); {11, 13, 13, 17});
} }
TEST(BufferToImageTest, WeightWidthLarge) { TEST(BufferToImageTest, WeightWidthLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_WIDTH, TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_WIDTH,
{64, 64, 11, 13}); {64, 64, 11, 13});
} }
TEST(BufferToImageTest, WeightHeightSmall) { TEST(BufferToImageTest, WeightHeightSmall) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT, TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{2, 1, 1, 1}); {2, 1, 1, 1});
} }
TEST(BufferToImageTest, WeightHeightMedium) { TEST(BufferToImageTest, WeightHeightMedium) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT, TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{11, 13, 13, 17}); {11, 13, 13, 17});
} }
TEST(BufferToImageTest, WeightHeightLarge) { TEST(BufferToImageTest, WeightHeightLarge) {
TestBidirectionTransform<DeviceType::GPU, float>(ops::WEIGHT_HEIGHT, TestBidirectionTransform<DeviceType::GPU, float>(
OpenCLBufferType::WEIGHT_HEIGHT,
{64, 16, 11, 13}); {64, 16, 11, 13});
} }
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type, void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<D, float>("Input", input_shape); net.AddRandomInput<D, float>("Input", input_shape);
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
net.RunOp(D); .Transform(&context, net.ws()->GetTensor("Input"),
type, MemoryType::GPU_IMAGE, 0, b2i_output);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run // Inverse Transform
net.RunOp(D); Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DT_FLOAT);
OpenCLBufferTransformer<float>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check // Check
ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
...@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type, ...@@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int type,
} // namespace } // namespace
TEST(BufferToImageTest, ArgFloatToHalfSmall) { TEST(BufferToImageTest, ArgFloatToHalfSmall) {
TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, TestDiffTypeBidirectionTransform<DeviceType::GPU, half>(
OpenCLBufferType::ARGUMENT,
{11}); {11});
} }
namespace { namespace {
template <DeviceType D, typename T> template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type, void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape, const std::vector<index_t> &input_shape,
const unsigned char *input_data) { const unsigned char *input_data) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data
const half *h_data = reinterpret_cast<const half *>(input_data); const half *h_data = reinterpret_cast<const half *>(input_data);
net.AddInputFromArray<D, half>("Input", input_shape, net.AddInputFromArray<D, half>("Input", input_shape,
std::vector<half>(h_data, h_data + 2)); std::vector<half>(h_data, h_data + 2));
Tensor *b2i_output = net.ws()->CreateTensor(
"B2IOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
// Run // Transform
net.RunOp(D); OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
.Transform(&context, net.ws()->GetTensor("Input"),
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") type, MemoryType::GPU_IMAGE, 0, b2i_output);
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Run // Inverse Transform
net.RunOp(D); Tensor *i2b_output = net.ws()->CreateTensor(
"I2BOutput", context.device()->allocator(), DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER)
.Transform(&context, b2i_output,
type, MemoryType::GPU_BUFFER, 0, i2b_output);
// Check // Check
ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
...@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { ...@@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const unsigned char input_data[] = { const unsigned char input_data[] = {
0xCD, 0x3C, 0x33, 0x40, 0xCD, 0x3C, 0x33, 0x40,
}; };
TestStringHalfBidirectionTransform<DeviceType::GPU, half>(ops::ARGUMENT, TestStringHalfBidirectionTransform<DeviceType::GPU, half>(
{2}, input_data); OpenCLBufferType::ARGUMENT, {2}, input_data);
} }
} // namespace test } // namespace test
......
...@@ -15,8 +15,7 @@ ...@@ -15,8 +15,7 @@
#include <memory> #include <memory>
#include "mace/core/operator.h" #include "mace/core/operator.h"
#include "mace/ops/opencl/buffer/buffer_transform.h" #include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/buffer_to_image.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
...@@ -29,29 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation { ...@@ -29,29 +28,27 @@ class BufferTransformOp<DeviceType::GPU, T> : public Operation {
public: public:
explicit BufferTransformOp(OpConstructContext *context) explicit BufferTransformOp(OpConstructContext *context)
: Operation(context), : Operation(context),
wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 2)) { wino_blk_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)),
if (context->device()->opencl_runtime()->UseImageMemory()) { out_mem_type_(static_cast<MemoryType>(Operation::GetOptionalArg<int>(
kernel_.reset(new opencl::image::BufferToImage<T>); "mem_type", static_cast<int>(MemoryType::GPU_IMAGE)))) {}
} else {
kernel_.reset(new opencl::buffer::BufferTransform<T>);
}
}
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(0); const Tensor *input = this->Input(0);
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
ops::BufferType type = auto type =
static_cast<ops::BufferType>(Operation::GetOptionalArg<int>( static_cast<OpenCLBufferType>(Operation::GetOptionalArg<int>(
"buffer_type", static_cast<int>(ops::CONV2D_FILTER))); "buffer_type", static_cast<int>(CONV2D_FILTER)));
return kernel_->Compute(context, input, type, MemoryType in_mem_type = context->workspace()->GetTensor(
wino_blk_size_, output); operator_def_->input(0))->memory_type();
return OpenCLBufferTransformer<T>(in_mem_type, out_mem_type_).Transform(
context, input, type, out_mem_type_, wino_blk_size_, output);
} }
private: private:
const int wino_blk_size_; const int wino_blk_size_;
std::unique_ptr<OpenCLBufferTransformKernel> kernel_; MemoryType out_mem_type_;
}; };
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <cstring> #include <cstring>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
namespace mace { namespace mace {
...@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase { ...@@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase {
namespace { namespace {
template <typename OrgType, typename DstType> template <typename OrgType, typename DstType>
void TestBidirectionTransform(const int type, void TestBidirectionTransform(const OpenCLBufferType type,
const std::vector<index_t> &input_shape) { const std::vector<index_t> &input_shape) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("TransformedOutput")
.AddIntArg("buffer_type", type)
.AddIntArg("T", DataTypeToEnum<DstType>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape); net.AddRandomInput<DeviceType::GPU, OrgType>("Input", input_shape);
Tensor *bt_output = net.ws()->CreateTensor(
// Run "BtOutput", context.device()->allocator(),
net.RunOp(DeviceType::GPU); DataTypeToEnum<DstType>::value);
OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") OpenCLBufferTransformer<DstType>(MemoryType::GPU_BUFFER,
.Input("TransformedOutput") MemoryType::GPU_BUFFER)
.Output("Output") .Transform(&context, net.ws()->GetTensor("Input"),
.AddIntArg("buffer_type", type) type, MemoryType::GPU_BUFFER, 0, bt_output);
.AddIntArg("T", DataTypeToEnum<OrgType>::value)
.Finalize(net.NewOperatorDef()); // Inverse Transform
Tensor *output = net.ws()->CreateTensor(
// Run "Output", context.device()->allocator(),
net.RunOp(DeviceType::GPU); DataTypeToEnum<OrgType>::value);
OpenCLBufferTransformer<OrgType>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, bt_output,
type, MemoryType::GPU_BUFFER, 0, output);
if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) { if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
...@@ -69,12 +70,7 @@ void TestBidirectionTransform(const int type, ...@@ -69,12 +70,7 @@ void TestBidirectionTransform(const int type,
} // namespace } // namespace
TEST_F(BufferTransformTest, FloatToHalf) { TEST_F(BufferTransformTest, FloatToHalf) {
TestBidirectionTransform<float, half>(ops::BufferType::IN_OUT_CHANNEL, TestBidirectionTransform<float, half>(OpenCLBufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4});
}
TEST_F(BufferTransformTest, HalfToHalf) {
TestBidirectionTransform<half, half>(ops::BufferType::IN_OUT_CHANNEL,
{1, 2, 3, 4}); {1, 2, 3, 4});
} }
...@@ -82,25 +78,27 @@ namespace { ...@@ -82,25 +78,27 @@ namespace {
template <typename T> template <typename T>
void TestArgumentTransform(const index_t input_size) { void TestArgumentTransform(const index_t input_size) {
OpsTestNet net; OpsTestNet net;
OpDefBuilder("BufferTransform", "BufferTransformTest") OpContext context(net.ws(),
.Input("Input") OpTestContext::Get()->GetDevice(DeviceType::GPU));
.Output("Output")
.AddIntArg("buffer_type", ops::BufferType::ARGUMENT)
.AddIntArg("T", DataTypeToEnum<T>::value)
.Finalize(net.NewOperatorDef());
// Add input data // Add input data
net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size}); net.AddRandomInput<DeviceType::GPU, T>("Input", {input_size});
// Run // Run
net.RunOp(DeviceType::GPU); Tensor *output = net.ws()->CreateTensor(
"Output", context.device()->allocator(),
DataTypeToEnum<T>::value);
OpenCLBufferTransformer<T>(MemoryType::GPU_BUFFER,
MemoryType::GPU_BUFFER)
.Transform(&context, net.ws()->GetTensor("Input"),
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
0, output);
auto output_tensor = net.GetOutput("Output");
index_t expected_size = RoundUp<index_t>(input_size, 4); index_t expected_size = RoundUp<index_t>(input_size, 4);
EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]); EXPECT_EQ(expected_size, output->buffer_shape()[0]);
// Check // Check
ExpectTensorNear<T>(*net.GetTensor("Input"), *output_tensor, ExpectTensorNear<T>(*net.GetTensor("Input"), *output,
1e-3, 1e-4); 1e-3, 1e-4);
} }
} // namespace } // namespace
......
...@@ -36,23 +36,11 @@ void ChannelShuffle( ...@@ -36,23 +36,11 @@ void ChannelShuffle(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
if (D == DeviceType::CPU) {
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage") .Input("Input")
.Output("Output") .Output("Output")
.AddIntArg("group", group) .AddIntArg("group", group)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up // Warm-up
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
......
...@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { ...@@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
"Input", {1, 1, 2, 16}, "Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::GPU, float>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage") .Input("Input")
.Output("OutputImage") .Output("Output")
.AddIntArg("group", 4) .AddIntArg("group", 4)
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
// Transfer output
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto expected = net.CreateTensor<float>( auto expected = net.CreateTensor<float>(
{1, 1, 2, 16}, {1, 1, 2, 16},
......
...@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation { ...@@ -28,7 +28,8 @@ class ConcatOpBase : public Operation {
public: public:
explicit ConcatOpBase(OpConstructContext *context) explicit ConcatOpBase(OpConstructContext *context)
: Operation(context), : Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 3)) {} axis_(Operation::GetOptionalArg<int>("axis", 3)),
checked_(false) {}
protected: protected:
void Validate() { void Validate() {
...@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation { ...@@ -42,6 +43,7 @@ class ConcatOpBase : public Operation {
protected: protected:
int axis_; int axis_;
bool checked_;
}; };
template <DeviceType D, class T> template <DeviceType D, class T>
...@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase { ...@@ -55,7 +57,15 @@ class ConcatOp<DeviceType::CPU, T> : public ConcatOpBase {
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context); MACE_UNUSED(context);
if (!checked_) {
Validate(); Validate();
if (this->Input(0)->dim_size() == 4) {
if (axis_ == 3) axis_ = 1;
else if (axis_ == 2) axis_ = 3;
else if (axis_ == 1) axis_ = 2;
}
checked_ = true;
}
const std::vector<const Tensor *> &inputs = this->Inputs(); const std::vector<const Tensor *> &inputs = this->Inputs();
Tensor *output = this->Output(0); Tensor *output = this->Output(0);
const Tensor *input0 = inputs.front(); const Tensor *input0 = inputs.front();
......
...@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128); ...@@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128);
namespace { namespace {
template <typename T> template <typename T>
void OpenclConcatHelper(int iters, void OpenCLConcatHelper(int iters,
const std::vector<index_t> &shape0, const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1, const std::vector<index_t> &shape1,
int concat_dim) { int concat_dim) {
...@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters, ...@@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Concat", "ConcatBM") OpDefBuilder("Concat", "ConcatBM")
.Input("InputImage0") .Input("Input0")
.Input("InputImage1") .Input("Input1")
.AddIntArg("axis", concat_dim) .AddIntArg("axis", concat_dim)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters, ...@@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters,
#define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ #define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\ static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\
std::vector<index_t> shape = {N, H, W, C}; \ std::vector<index_t> shape = {N, H, W, C}; \
OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \ OpenCLConcatHelper<TYPE>(iters, shape, shape, 3); \
} \ } \
MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
......
...@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) { ...@@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) {
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
int dim = 5; int dim = 5;
int num_inputs = 2 + rand_r(&seed) % 10; int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim; int axis = 1;
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
...@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { ...@@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
static unsigned int seed = time(NULL); static unsigned int seed = time(NULL);
int dim = 4; int dim = 4;
int num_inputs = 2 + rand_r(&seed) % 10; int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim; int axis = 1;
int axis_arg = 3; // NHWC
// Construct graph // Construct graph
OpsTestNet net; OpsTestNet net;
...@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { ...@@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
std::vector<index_t> output_shape = input_shapes[0]; std::vector<index_t> output_shape = input_shapes[0];
output_shape[axis] = concat_axis_size; output_shape[axis] = concat_axis_size;
net.AddRandomInput<DeviceType::CPU, float>( net.AddRandomInput<DeviceType::CPU, float>(
"Output", output_shape, true, true); "Output", output_shape, false, true, true);
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(MakeString("Input", i)); builder = builder.Input(MakeString("Input", i));
} }
builder.AddIntArg("axis", axis) builder.AddIntArg("axis", axis_arg)
.Output("Output") .Output("Output")
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { ...@@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) {
net.RunOp(); net.RunOp();
net.AddRandomInput<DeviceType::CPU, uint8_t>( net.AddRandomInput<DeviceType::CPU, uint8_t>(
"QuantizedOutput", output_shape, true, true); "QuantizedOutput", output_shape, false, true, true);
auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest"); auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest");
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
q_builder = q_builder.Input(MakeString("QuantizedInput", i)); q_builder = q_builder.Input(MakeString("QuantizedInput", i));
...@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes, ...@@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
OpsTestNet net; OpsTestNet net;
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
const std::string input_name = MakeString("Input", i); const std::string input_name = MakeString("Input", i);
const std::string image_name = MakeString("InputImage", i);
concat_axis_size += shapes[i][axis]; concat_axis_size += shapes[i][axis];
GenerateRandomRealTypeData(shapes[i], &inputs[i]); GenerateRandomRealTypeData(shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data(); input_ptrs[i] = inputs[i].data();
net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i], net.AddInputFromArray<DeviceType::GPU, float>(input_name, shapes[i],
inputs[i]); inputs[i]);
BufferToImage<DeviceType::GPU, T>(&net, input_name, image_name,
ops::BufferType::IN_OUT_CHANNEL);
} }
auto builder = OpDefBuilder("Concat", "ConcatTest"); auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) { for (int i = 0; i < num_inputs; ++i) {
const std::string image_name = MakeString("InputImage", i); const std::string image_name = MakeString("Input", i);
builder = builder.Input(image_name); builder = builder.Input(image_name);
} }
builder.AddIntArg("axis", axis) builder.AddIntArg("axis", axis)
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
// Run // Run
net.RunOp(DeviceType::GPU); net.RunOp(DeviceType::GPU);
ImageToBuffer<DeviceType::GPU, float>(&net, "OutputImage", "Output",
ops::BufferType::IN_OUT_CHANNEL);
// Check // Check
auto output = net.GetOutput("Output"); auto output = net.GetOutput("Output");
......
...@@ -38,8 +38,9 @@ ...@@ -38,8 +38,9 @@
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL #ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/conv_2d.h" #include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/buffer/conv_2d.h" #include "mace/ops/opencl/buffer/conv_2d.h"
#include "mace/ops/opencl/image/conv_2d.h"
#endif // MACE_ENABLE_OPENCL #endif // MACE_ENABLE_OPENCL
namespace mace { namespace mace {
...@@ -959,12 +960,44 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -959,12 +960,44 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
activation_(ops::StringToActivationType( activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", Operation::GetOptionalArg<std::string>("activation",
"NOOP"))), "NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) { relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->opencl_runtime()->UseImageMemory()) { if (context->device()->opencl_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::Conv2dKernel<T>); kernel_.reset(new opencl::image::Conv2dKernel<T>);
} else { } else {
mem_type = MemoryType::GPU_BUFFER;
kernel_.reset(new opencl::buffer::Conv2dKernel<T>); kernel_.reset(new opencl::buffer::Conv2dKernel<T>);
} }
context->set_output_mem_type(mem_type);
// Transform filter tensor to target format
if ((wino_block_size_ == 2 || wino_block_size_ == 4) &&
(kernel_->CheckUseWinograd(
context->device()->opencl_runtime(),
context->workspace()->GetTensor(
operator_def_->input(1))->shape(),
std::vector<index_t>(operator_def_->output_shape(0).dims().begin(),
operator_def_->output_shape(0).dims().end()),
strides_.data(),
dilations_.data(),
&wino_block_size_))) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_)
== MaceStatus::MACE_SUCCESS);
} else {
wino_block_size_ = 0;
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 1,
OpenCLBufferType::CONV2D_FILTER, mem_type)
== MaceStatus::MACE_SUCCESS);
}
if (operator_def_->input_size() > 2) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
}
} }
MaceStatus Run(OpContext *context) override { MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT); const Tensor *input = this->Input(INPUT);
...@@ -974,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase { ...@@ -974,13 +1007,14 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
return kernel_->Compute(context, input, filter, bias, return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_, strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_, dilations_.data(), activation_, relux_max_limit_,
output); wino_block_size_, output);
} }
private: private:
const ActivationType activation_; const ActivationType activation_;
const float relux_max_limit_; const float relux_max_limit_;
std::unique_ptr<OpenCLConv2dKernel> kernel_; std::unique_ptr<OpenCLConv2dKernel> kernel_;
int wino_block_size_;
private: private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
......
...@@ -49,10 +49,9 @@ void Conv2d(int iters, ...@@ -49,10 +49,9 @@ void Conv2d(int iters,
} }
net.AddRandomInput<D, float>("Filter", net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h, {output_channels, channels, kernel_h,
kernel_w}); kernel_w}, true);
net.AddRandomInput<D, float>("Bias", {output_channels}); net.AddRandomInput<D, float>("Bias", {output_channels}, true);
if (D == DeviceType::CPU) {
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
...@@ -63,26 +62,6 @@ void Conv2d(int iters, ...@@ -63,26 +62,6 @@ void Conv2d(int iters,
.AddIntsArg("dilations", {dilation, dilation}) .AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
} else if (D == DeviceType::GPU) {
BufferToImage<D, T>(&net, "Input", "InputImage",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
ops::BufferType::CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
ops::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
net.Setup(D); net.Setup(D);
...@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters, ...@@ -123,9 +102,9 @@ void Conv2d<CPU, uint8_t>(int iters,
"Input", {batch, height, width, channels}); "Input", {batch, height, width, channels});
net.GetTensor("Input")->SetScale(0.1); net.GetTensor("Input")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, uint8_t>( net.AddRandomInput<DeviceType::CPU, uint8_t>(
"Filter", {output_channels, kernel_h, kernel_w, channels}); "Filter", {output_channels, kernel_h, kernel_w, channels}, true);
net.GetTensor("Filter")->SetScale(0.1); net.GetTensor("Filter")->SetScale(0.1);
net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}); net.AddRandomInput<DeviceType::CPU, int32_t>("Bias", {output_channels}, true);
OpDefBuilder("Conv2D", "Conv2dTest") OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input") .Input("Input")
.Input("Filter") .Input("Filter")
......
此差异已折叠。
...@@ -24,7 +24,7 @@ namespace ops { ...@@ -24,7 +24,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape, void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
Padding padding, Padding padding,
...@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC ...@@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
void CalcOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *padding_size, const int *padding_size,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
......
...@@ -35,7 +35,7 @@ namespace ops { ...@@ -35,7 +35,7 @@ namespace ops {
void CalcPaddingAndOutputSize(const index_t *input_shape, void CalcPaddingAndOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
Padding padding, Padding padding,
...@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, ...@@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,
void CalcOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape,
const DataFormat input_format, const DataFormat input_format,
const index_t *filter_shape, const index_t *filter_shape,
const DataFormat filter_format, const FilterDataFormat filter_format,
const int *padding_size, const int *padding_size,
const int *dilations, const int *dilations,
const int *strides, const int *strides,
......
此差异已折叠。
...@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6); ...@@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6);
namespace { namespace {
template <typename T> template <typename T>
void OpenclCropHelper(int iters, void OpenCLCropHelper(int iters,
const std::vector<index_t> &shape0, const std::vector<index_t> &shape0,
const std::vector<index_t> &shape1, const std::vector<index_t> &shape1,
int crop_axis, int crop_axis,
...@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters, ...@@ -79,16 +79,12 @@ void OpenclCropHelper(int iters,
net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0); net.AddRandomInput<DeviceType::GPU, float>("Input0", shape0);
net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1); net.AddRandomInput<DeviceType::GPU, float>("Input1", shape1);
BufferToImage<DeviceType::GPU, T>(&net, "Input0", "InputImage0",
ops::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::GPU, T>(&net, "Input1", "InputImage1",
ops::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Crop", "CropBM") OpDefBuilder("Crop", "CropBM")
.Input("InputImage0") .Input("Input0")
.Input("InputImage1") .Input("Input1")
.AddIntArg("axis", crop_axis) .AddIntArg("axis", crop_axis)
.AddIntsArg("offset", {offset}) .AddIntsArg("offset", {offset})
.Output("OutputImage") .Output("Output")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value)) .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef()); .Finalize(net.NewOperatorDef());
...@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters, ...@@ -114,7 +110,7 @@ void OpenclCropHelper(int iters,
_##TYPE(int iters) { \ _##TYPE(int iters) { \
std::vector<index_t> shape0 = {N, H, W, C}; \ std::vector<index_t> shape0 = {N, H, W, C}; \
std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \ std::vector<index_t> shape1 = {N / 2, H / 2, W / 2, C / 2}; \
OpenclCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \ OpenCLCropHelper<TYPE>(iters, shape0, shape1, AXIS, OFFSET); \
} \ } \
MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\
##_##TYPE) ##_##TYPE)
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform( ...@@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform(
} }
}; };
} }
// Mark the buffer unused.
const_cast<Tensor *>(input)->MarkUnused();
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册