diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 2163c44c1563e35404cf51eadd36c27f717969d0..1c32b7996e0255bc288b3310fd2db76a952e1112 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -69,9 -69,9 @@ in one deployment file. - The output tensor name(s) (tensorflow) or top name(s) of outputs' layer (caffe). If there are more than one tensors, use one line for a tensor. * - input_shapes - - The shapes of the input tensors, in NHWC order. + - The shapes of the input tensors, in NHWC order by default. * - output_shapes - - The shapes of the output tensors, in NHWC order. + - The shapes of the output tensors, in NHWC order by default. * - input_ranges - The numerical range of the input tensors' data, default [-1, 1]. It is only for test. * - validation_inputs_data @@ -84,6 +84,10 @@ in one deployment file. - [optional] The data type used for specified runtime. [fp16_fp32, fp32_fp32] for GPU, default is fp16_fp32, [fp32] for CPU and [uint8] for DSP. * - input_data_types - [optional] The input data type for specific op(eg. gather), which can be [int32, float32], default to float32. + * - input_data_formats - [optional] The data formats of the input tensors, one of [NONE, NHWC]. Use NONE if an input has no data format. If only a single format is specified, it applies to all inputs; default is NHWC. + * - output_data_formats - [optional] The data formats of the output tensors, one of [NONE, NHWC]. Use NONE if an output has no data format. If only a single format is specified, it applies to all outputs; default is NHWC. + * - limit_opencl_kernel_time - [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0. 
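For illustration, the two new fields sit next to the existing tensor fields in a model's subgraph definition of the deployment file; a minimal sketch follows, in which the tensor names and shapes are placeholders rather than part of this change:

    input_tensors:
      - input
    input_shapes:
      - 1,224,224,3
    input_data_formats:
      - NHWC
    output_tensors:
      - output
    output_shapes:
      - 1,1001
    output_data_formats:
      - NONE
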
* - obfuscate diff --git a/docs/user_guide/devices/demo_device_nanopi.yml b/docs/user_guide/devices/demo_device_nanopi.yml new file mode 100644 index 0000000000000000000000000000000000000000..567f7c7e1ce08af39134527d9eae825a688cb76f --- /dev/null +++ b/docs/user_guide/devices/demo_device_nanopi.yml @@ -0,0 +1,23 @@ +# one yaml config file can contain multi device info +devices: + # The name of the device + nanopi: + # arm64 or armhf + target_abis: [arm64, armhf] + # device soc, you can get it from device manual + target_socs: RK3399 + # device model full name + models: FriendlyElec Nanopi M4 + # device ip address + address: 10.0.0.0 + # login username + username: user + # login password, is required when you can login into device without password + password: 1234567 + raspberry: + target_abis: [armv7l] + target_socs: BCM2837 + models: Raspberry Pi 3 Model B Plus Rev 1.3 + address: 10.0.0.1 + username: user + password: 123456 diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc index 60fb38f7d71895db95ccd1ec88a765b5fecfc5cc..cd3c4d1f2071a547f4a5c034629b45deefe74b28 100644 --- a/mace/core/arg_helper.cc +++ b/mace/core/arg_helper.cc @@ -95,4 +95,12 @@ MACE_GET_REPEATED_ARGUMENT_FUNC(float, floats, false) MACE_GET_REPEATED_ARGUMENT_FUNC(int, ints, true) MACE_GET_REPEATED_ARGUMENT_FUNC(int64_t, ints, true) #undef MACE_GET_REPEATED_ARGUMENT_FUNC + + +bool IsQuantizedModel(const NetDef &net_def) { + return + ProtoArgHelper::GetOptionalArg(net_def, "quantize_flag", 0) + == 1; +} + } // namespace mace diff --git a/mace/core/arg_helper.h b/mace/core/arg_helper.h index 50ec4eade9c05eb12d0b555595a665e590a14965..238b0800e5f971287bb1e85c592b9ea5af01eaf3 100644 --- a/mace/core/arg_helper.h +++ b/mace/core/arg_helper.h @@ -55,6 +55,8 @@ class ProtoArgHelper { std::map arg_map_; }; +bool IsQuantizedModel(const NetDef &def); + } // namespace mace #endif // MACE_CORE_ARG_HELPER_H_ diff --git a/mace/core/buffer.h b/mace/core/buffer.h index 521ccc82820597275adc387a3cd47e235e52df81..c859268f818d998983d610333636f187195e8aea 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -233,6 +233,11 @@ class Image : public BufferBase { } } + inline DataType dtype() const { + MACE_CHECK_NOTNULL(buf_); + return data_type_; + } + void *buffer() { MACE_CHECK_NOTNULL(buf_); return buf_; diff --git a/mace/core/device.h b/mace/core/device.h index bfa00b02f95c3fe9ab5af78dcc264f79ecc679df..b7fe5f329b99401d31b04af102b2ca1d32d06bff 100644 --- a/mace/core/device.h +++ b/mace/core/device.h @@ -34,7 +34,7 @@ class Device { #ifdef MACE_ENABLE_OPENCL virtual OpenCLRuntime *opencl_runtime() = 0; -#endif +#endif // MACE_ENABLE_OPENCL virtual CPURuntime *cpu_runtime() = 0; virtual Allocator *allocator() = 0; diff --git a/mace/core/memory_optimizer.cc b/mace/core/memory_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..f773befca2f686bc17062ecf2cde19f22c68a81e --- /dev/null +++ b/mace/core/memory_optimizer.cc @@ -0,0 +1,270 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/memory_optimizer.h" + +#include +#include +#include +#include +#include + +#include "mace/core/arg_helper.h" +#include "mace/core/macros.h" +#include "mace/utils/logging.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { + +bool MemoryOptimizer::IsMemoryReuseOp(const std::string &op_type) { + static const std::unordered_set kReuseOp = { + "Reshape", "Identity", "Squeeze", "ExpandDims" + }; + return kReuseOp.count(op_type) == 1; +} + +void MemoryOptimizer::UpdateTensorRef(const std::string &tensor_name) { + if (tensor_ref_count_.count(tensor_name) == 0) { + tensor_ref_count_.emplace(tensor_name, 1); + } else { + tensor_ref_count_[tensor_name] += 1; + } +} + +void MemoryOptimizer::UpdateTensorRef(const mace::OperatorDef *op_def) { + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (tensor_ref_count_.count(op_def->input(i)) == 1) { + tensor_ref_count_[op_def->input(i)] += 1; + } + } + int output_size = op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (tensor_ref_count_.count(op_def->output(i)) == 0) { + tensor_ref_count_.emplace(op_def->output(i), 0); + } + } +} + +MemoryBlock MemoryOptimizer::CreateMemoryBlock( + std::vector shape, + DataType dt, + mace::MemoryType mem_type) { + MemoryBlock block; +#ifdef MACE_ENABLE_OPENCL + if (mem_type == MemoryType::GPU_IMAGE) { + std::vector image_shape; + if (shape.size() == 2) { + shape = {shape[0], 1, 1, shape[1]}; + } else { + MACE_CHECK(shape.size() == 4) << "GPU only support 2D/4D input"; + } + OpenCLUtil::CalImage2DShape(shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); + block.set_x(image_shape[0]); + block.set_y(image_shape[1]); + return block; + } +#endif // MACE_ENABLE_OPENCL + MACE_UNUSED(mem_type); + int64_t op_mem_size = std::accumulate(shape.begin(), + shape.end(), + GetEnumTypeSize(dt), + std::multiplies()); + block.set_x(op_mem_size); + block.set_y(1); + return block; +} + +void MemoryOptimizer::Optimize( + const mace::OperatorDef *op_def, + const std::unordered_map &mem_types) { + MACE_LATENCY_LOGGER(2, "Optimize memory"); + if (op_def->output_size() != op_def->output_shape_size()) { + VLOG(1) << op_def->name() + << ": the number of output shape " + << "is not equal to the number of output"; + return; + } + + auto device = static_cast(op_def->device_type()); + DataType op_dtype = static_cast(ProtoArgHelper::GetOptionalArg( + *op_def, + "T", + static_cast(DT_FLOAT))); + MACE_CHECK( + op_def->output_type_size() == 0 || + op_def->output_size() == op_def->output_type_size(), + "operator output size != operator output type size", + op_def->output_size(), + op_def->output_type_size()); + DataType dt; + + int output_size = op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (i < op_def->output_type_size()) { + dt = op_def->output_type(i); + } else { + dt = op_dtype; + } + int best_mem_id = -1; + MemoryType mem_type = MemoryType::CPU_BUFFER; + if (device == DeviceType::GPU) { + mem_type = mem_types.at(op_def->output(i)); + } + auto shape = std::vector( + op_def->output_shape(i).dims().begin(), + op_def->output_shape(i).dims().end()); + MemoryBlock op_mem_block = CreateMemoryBlock(shape, dt, mem_type); + MemoryBlock best_mem_block; + if (IsMemoryReuseOp(op_def->type())) { + if (tensor_mem_map_.count(op_def->input(0)) == 1) { + best_mem_id = 
tensor_mem_map_[op_def->input(0)].first; + } + } else { + auto shape = std::vector( + op_def->output_shape(i).dims().begin(), + op_def->output_shape(i).dims().end()); + + int64_t op_mem_size = op_mem_block.x() * op_mem_block.y(); + int64_t best_added_mem_size = LLONG_MAX; + int64_t best_wasted_mem_size = LLONG_MAX; + + int64_t old_mem_size = 0, new_mem_size = 0; + MemoryBlock new_mem_block; + for (auto idle_mem_id : idle_blocks_) { + if (mem_blocks_[idle_mem_id].mem_type() == mem_type) { + if (mem_type == MemoryType::GPU_IMAGE) { + // GPU Image could reuse memory with same data type only + if (mem_blocks_[idle_mem_id].data_type() != dt) { + continue; + } + old_mem_size = + mem_blocks_[idle_mem_id].x() * mem_blocks_[idle_mem_id].y(); + new_mem_block.set_x(std::max(mem_blocks_[idle_mem_id].x(), + op_mem_block.x())); + new_mem_block.set_y(std::max(mem_blocks_[idle_mem_id].y(), + op_mem_block.y())); + new_mem_size = new_mem_block.x() * new_mem_block.y(); + } else { + old_mem_size = mem_blocks_[idle_mem_id].x(); + new_mem_size = std::max(op_mem_size, old_mem_size); + new_mem_block.set_x(new_mem_size); + } + int64_t added_mem_size = new_mem_size - old_mem_size; + int64_t wasted_mem_size = new_mem_size - op_mem_size; + // minimize add_mem_size; if best_mem_add_size is 0, + // then minimize waste_mem_size + if ((best_added_mem_size > 0 && added_mem_size < best_added_mem_size) + || (best_added_mem_size == 0 && + wasted_mem_size < best_wasted_mem_size)) { + best_mem_id = idle_mem_id; + best_added_mem_size = added_mem_size; + best_wasted_mem_size = wasted_mem_size; + best_mem_block = new_mem_block; + } + } + } + + if (best_added_mem_size <= op_mem_size) { + best_mem_block.set_mem_id(best_mem_id); + best_mem_block.set_data_type(dt); + best_mem_block.set_mem_type(mem_type); + mem_blocks_[best_mem_id] = best_mem_block; + idle_blocks_.erase(best_mem_id); + } else { + best_mem_id = static_cast(mem_blocks_.size()); + best_mem_block.set_mem_id(best_mem_id); + best_mem_block.set_data_type(dt); + best_mem_block.set_mem_type(mem_type); + best_mem_block.set_x(op_mem_block.x()); + best_mem_block.set_y(op_mem_block.y()); + mem_blocks_.push_back(best_mem_block); + } + } + + if (best_mem_id != -1) { + if (mem_ref_count_.count(best_mem_id) == 1) { + mem_ref_count_[best_mem_id] += 1; + } else { + mem_ref_count_[best_mem_id] = 1; + } + tensor_mem_map_[op_def->output(i)] = std::make_pair(best_mem_id, dt); + } + } + + // de-refer input tensors + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + auto &input_name = op_def->input(i); + if (tensor_ref_count_.count(input_name) == 1) { + tensor_ref_count_[input_name] -= 1; + if (tensor_ref_count_.at(input_name) == 0 && + tensor_mem_map_.count(input_name) == 1) { + int mem_id = tensor_mem_map_.at(input_name).first; + mem_ref_count_[mem_id] -= 1; + if (mem_ref_count_.at(mem_id) == 0) { + idle_blocks_.insert(mem_id); + } + } else { + MACE_CHECK(tensor_ref_count_.at(input_name) >= 0); + } + } + } +} + +const std::vector& MemoryOptimizer::mem_blocks() const { + return mem_blocks_; +} + +const std::unordered_map>& + MemoryOptimizer::tensor_mem_map() const { + return tensor_mem_map_; +} + +std::string MemoryOptimizer::DebugInfo() const { + auto memory_type_to_str = [](const MemoryType type) -> std::string { + if (type == MemoryType::CPU_BUFFER) { + return "CPU_BUFFER"; + } else if (type == MemoryType::GPU_BUFFER) { + return "GPU_BUFFER"; + } else if (type == MemoryType::GPU_IMAGE) { + return "GPU_IMAGE"; + } else { + return "UNKNOWN"; + } + }; + 
std::stringstream sstream; + sstream << "\n"; + size_t block_size = mem_blocks_.size(); + for (size_t i = 0; i < block_size; ++i) { + sstream << i << " " << memory_type_to_str(mem_blocks_[i].mem_type()) + << " "; + if (mem_blocks_[i].mem_type() == MemoryType::GPU_IMAGE) { + sstream << DataTypeToString(mem_blocks_[i].data_type()) << " " + "[" << mem_blocks_[i].x() << ", " << mem_blocks_[i].y() << "]"; + } else { + sstream << "[" << mem_blocks_[i].x() << "]"; + } + sstream << "\n"; + } + + return sstream.str(); +} + +} // namespace mace diff --git a/mace/core/memory_optimizer.h b/mace/core/memory_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..fa24a206f1e42a2c2f3d5e7dde16b648dc19e2b0 --- /dev/null +++ b/mace/core/memory_optimizer.h @@ -0,0 +1,110 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_MEMORY_OPTIMIZER_H_ +#define MACE_CORE_MEMORY_OPTIMIZER_H_ + +#include +#include +#include +#include +#include + +#include "mace/proto/mace.pb.h" +#include "mace/core/types.h" + +namespace mace { + +class MemoryBlock { + public: + inline void set_mem_id(int mem_id) { + mem_id_ = mem_id; + } + + inline int mem_id() const { + return mem_id_; + } + + inline void set_data_type(DataType data_type) { + data_type_ = data_type; + } + + inline DataType data_type() const { + return data_type_; + } + + inline void set_mem_type(MemoryType mem_type) { + mem_type_ = mem_type; + } + + inline MemoryType mem_type() const { + return mem_type_; + } + + inline void set_x(int64_t x) { + x_ = x; + } + + inline int64_t x() const { + return x_; + } + + inline void set_y(int64_t y) { + y_ = y; + } + + inline int64_t y() const { + return y_; + } + + private: + int mem_id_; + DataType data_type_; + MemoryType mem_type_; + int64_t x_; + int64_t y_; +}; + +class MemoryOptimizer { + public: + static bool IsMemoryReuseOp(const std::string &op_type); + void UpdateTensorRef(const std::string &tensor_name); + void UpdateTensorRef(const OperatorDef *op_def); + void Optimize(const OperatorDef *op_def, + const std::unordered_map &mem_types); + + const std::vector &mem_blocks() const; + + const std::unordered_map> &tensor_mem_map() const; + + std::string DebugInfo() const; + + private: + MemoryBlock CreateMemoryBlock(std::vector shape, + DataType dt, + MemoryType mem_type); + + private: + std::unordered_map tensor_ref_count_; + std::vector mem_blocks_; + // tensor name : + // Buffer Memory do not different data type, so store the data type. 
+ std::unordered_map> tensor_mem_map_; + std::unordered_map mem_ref_count_; + std::set idle_blocks_; +}; + +} // namespace mace +#endif // MACE_CORE_MEMORY_OPTIMIZER_H_ diff --git a/mace/core/net.cc b/mace/core/net.cc index 1fe5b0e947fb4d7f9f5d82b91e48f6096cdfcb8b..279724f6e791623923e8772b5db88a4bb8293413 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -18,6 +18,7 @@ #include "mace/core/future.h" #include "mace/core/macros.h" +#include "mace/core/memory_optimizer.h" #include "mace/core/net.h" #include "mace/core/op_context.h" #include "mace/public/mace.h" @@ -25,13 +26,94 @@ #include "mace/utils/timer.h" #include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_util.h" +#endif // MACE_ENABLE_OPENCL + namespace mace { +namespace { +struct InternalOutputInfo { + InternalOutputInfo(const MemoryType mem_type, + const DataType dtype, + const std::vector &shape, + int op_idx) + : mem_type(mem_type), dtype(dtype), shape(shape), op_idx(op_idx) {} + + MemoryType mem_type; // transformed memory type + DataType dtype; + std::vector shape; // tensor shape + int op_idx; // operation which generate the tensor +}; + +#ifdef MACE_ENABLE_OPENCL +std::string TransformedName(const std::string &input_name, + const mace::MemoryType mem_type) { + std::stringstream ss; + ss << input_name << "_mem_type_" << mem_type; + return ss.str(); +} +#endif // MACE_ENABLE_OPENCL + +} // namespace + +std::unique_ptr SerialNet::CreateOperation( + const OpRegistryBase *op_registry, + OpConstructContext *construct_context, + std::shared_ptr op_def, + DataFormat data_format_flag, + bool is_quantize_model) { + // Create the Operation + DeviceType target_device_type = target_device_->device_type(); + // Get available devices + auto available_devices = op_registry->AvailableDevices(op_def->type()); + // Find the device type to run the op. + // If the target_device_type in available devices, use target_device_type, + // otherwise, fallback to CPU device. 
+ DeviceType device_type = DeviceType::CPU; + construct_context->set_device(cpu_device_); + construct_context->set_output_mem_type(MemoryType::CPU_BUFFER); + for (auto device : available_devices) { + if (device == target_device_type) { + device_type = target_device_type; + construct_context->set_device(target_device_); + if (target_device_->device_type() == DeviceType::GPU) { + construct_context->set_output_mem_type(MemoryType::GPU_IMAGE); + } + break; + } + } + op_def->set_device_type(device_type); + // transpose output shape if run on CPU (default format is NHWC) + if (!is_quantize_model && device_type == DeviceType::CPU && + op_def->output_shape_size() == op_def->output_size()) { + for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { + if (data_format_flag == NHWC && + op_def->output_shape(out_idx).dims_size() == 4) { + // NHWC -> NCHW + std::vector output_shape = + TransposeShape( + std::vector( + op_def->output_shape(out_idx).dims().begin(), + op_def->output_shape(out_idx).dims().end()), + {0, 3, 1, 2}); + for (int i = 0; i < 4; ++i) { + op_def->mutable_output_shape(out_idx)->set_dims(i, output_shape[i]); + } + } + } + } + construct_context->set_operator_def(op_def); + std::unique_ptr op( + op_registry->CreateOperation(construct_context, device_type)); + return std::move(op); +} + SerialNet::SerialNet(const OpRegistryBase *op_registry, const NetDef *net_def, Workspace *ws, Device *target_device, - const NetMode mode) + MemoryOptimizer *mem_optimizer) : NetBase(), ws_(ws), target_device_(target_device), @@ -40,49 +122,213 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry, target_device->cpu_runtime()->policy(), target_device->cpu_runtime()->use_gemmlowp())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - // Create Operations - DeviceType target_device_type = target_device_->device_type(); + // output tensor : related information + std::unordered_map output_map; + // used for memory optimization + std::unordered_map output_mem_map; + std::unordered_map transformed_map; + // add input information + MemoryType target_mem_type; + // quantize model flag + bool is_quantize_model = IsQuantizedModel(*net_def); + // + DataFormat data_format_flag = NHWC; + if (target_device_->device_type() == DeviceType::CPU) { + target_mem_type = MemoryType::CPU_BUFFER; + for (auto &input_info : net_def->input_info()) { + std::vector input_shape = + std::vector(input_info.dims().begin(), + input_info.dims().end()); + // Only could be NONE or NHWC + auto input_data_format = static_cast( + input_info.data_format()); + if (!is_quantize_model && + input_data_format == NHWC && + input_info.dims_size() == 4) { + // NHWC -> NCHW + input_shape = + TransposeShape(input_shape, {0, 3, 1, 2}); + } else if (input_data_format == DataFormat::DF_NONE) { + data_format_flag = DataFormat::DF_NONE; + } + output_map.emplace(input_info.name(), InternalOutputInfo( + target_mem_type, DataType::DT_FLOAT, input_shape, -1)); + } + } +#ifdef MACE_ENABLE_OPENCL + else { // GPU NOLINT[readability/braces] + target_mem_type = MemoryType::GPU_BUFFER; + for (auto &input_info : net_def->input_info()) { + std::vector input_shape = + std::vector(input_info.dims().begin(), + input_info.dims().end()); + output_map.emplace(input_info.name(), InternalOutputInfo( + target_mem_type, DataType::DT_FLOAT, input_shape, -1)); + } + } +#endif // MACE_ENABLE_OPENCL + OpConstructContext construct_context(ws_); for (int idx = 0; idx < net_def->op_size(); ++idx) { - const auto &operator_def = net_def->op(idx); - // Create the 
Operation - const int op_device = - ProtoArgHelper::GetOptionalArg( - operator_def, "device", static_cast(target_device_type)); - if (op_device == target_device_type) { - // Get available devices (sorted based on priority) - OperatorDef temp_def(operator_def); - auto available_devices = op_registry->AvailableDevices(temp_def.type()); - // Find the device type to run the op. - // If the target_device_type in available devices, use target_device_type, - // otherwise, fallback to CPU device. - DeviceType device_type = DeviceType::CPU; - construct_context.set_device(cpu_device_); - for (auto device : available_devices) { - if (device == target_device_type) { - device_type = target_device_type; - construct_context.set_device(target_device_); - break; + std::shared_ptr op_def(new OperatorDef(net_def->op(idx))); + // Create operation + auto op = CreateOperation(op_registry, + &construct_context, + op_def, + data_format_flag, + is_quantize_model); +#ifdef MACE_ENABLE_OPENCL + // Add input transform operation if necessary + if (target_device_->device_type() == DeviceType::GPU) { + const DataType dt = + static_cast( + ProtoArgHelper::GetOptionalArg( + *op_def, "T", static_cast(DataType::DT_FLOAT))); + // the outputs' memory type of the operation + MemoryType out_mem_type = construct_context.output_mem_type(); + int input_size = op_def->input_size(); + for (int i = 0; i < input_size; ++i) { + if (output_map.count(op_def->input(i)) == 1) { + // if op is memory-reuse op, no transformation + if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) { + out_mem_type = output_map.at(op_def->input(i)).mem_type; + break; + } + // check whether is the output tensor of other operation + if (output_map.at(op_def->input(i)).mem_type != out_mem_type || + output_map.at(op_def->input(i)).dtype != dt) { + auto key = TransformedName(op_def->input(i), out_mem_type); + auto &output_info = output_map.at(op_def->input(i)); + // check whether the tensor has been transformed + if (transformed_map.count(key) == 0) { + VLOG(1) << "Add Transform operation to transform tensor '" + << op_def->input(i) << "', from memory type " + << output_info.mem_type << " to " << out_mem_type + << ", from Data Type " << output_info.dtype << " to " + << dt; + std::string input_name = op_def->input(i); + std::string t_input_name = + TransformedName(input_name, + out_mem_type); + op_def->set_input(i, t_input_name); + auto input_shape = output_info.shape; + if (output_info.mem_type == MemoryType::CPU_BUFFER && + input_shape.size() == 4) { + // NCHW -> NHWC + input_shape = + TransposeShape(input_shape, + {0, 2, 3, 1}); + } + auto transform_op_def = OpenCLUtil::CreateTransformOpDef( + input_name, input_shape, t_input_name, + dt, out_mem_type); + auto transform_op = CreateOperation( + op_registry, + &construct_context, + transform_op_def, + data_format_flag); + operators_.emplace_back(std::move(transform_op)); + transformed_map.emplace(key, t_input_name); + output_mem_map[t_input_name] = out_mem_type; + // where to do graph reference count. 
+ mem_optimizer->UpdateTensorRef(transform_op_def.get()); + } else { + op_def->set_input(i, transformed_map[key]); + } + } + } else { + MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr + && ws_->GetTensor(op_def->input(i))->is_weight(), + "Tensor ", op_def->input(i), " of ", + op_def->name(), " not allocated"); } } - temp_def.set_device_type(device_type); - construct_context.set_operator_def(&temp_def); - std::unique_ptr op( - op_registry->CreateOperation(&construct_context, device_type, mode)); - if (op) { - operators_.emplace_back(std::move(op)); + // update the map : output_tensor -> Operation + for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) { + output_mem_map[op_def->output(out_idx)] = out_mem_type; + output_map.emplace( + op_def->output(out_idx), + InternalOutputInfo( + out_mem_type, + dt, + op_def->output_shape().empty() ? + std::vector() : + std::vector( + op_def->output_shape(out_idx).dims().begin(), + op_def->output_shape(out_idx).dims().end()), + static_cast(operators_.size()))); + } + } +#endif // MACE_ENABLE_OPENCL + operators_.emplace_back(std::move(op)); + // where to do graph reference count. + mem_optimizer->UpdateTensorRef(op_def.get()); + } + +#ifdef MACE_ENABLE_OPENCL + // Transform the output tensor if necessary + if (target_device_->device_type() == DeviceType::GPU) { + for (auto &output_info : net_def->output_info()) { + auto &internal_output_info = output_map.at(output_info.name()); + if ((internal_output_info.mem_type != target_mem_type && + internal_output_info.mem_type != MemoryType::CPU_BUFFER) || + internal_output_info.dtype != DataType::DT_FLOAT) { + VLOG(1) << "Add Transform operation to transform output tensor '" + << output_info.name() << "', from memory type " + << internal_output_info.mem_type + << " to " << target_mem_type + << ", from Data Type " << internal_output_info.dtype + << " to " << DataType::DT_FLOAT; + std::string t_output_name = TransformedName(output_info.name(), + target_mem_type); + auto output_op_def = + operators_[internal_output_info.op_idx]->operator_def(); + int output_size = output_op_def->output_size(); + for (int i = 0; i < output_size; ++i) { + if (output_op_def->output(i) == output_info.name()) { + output_op_def->set_output(i, t_output_name); + // update the output : mem_type map + output_mem_map[t_output_name] = output_mem_map[output_info.name()]; + output_mem_map[output_info.name()] = target_mem_type; + } + } + auto output_data_format = + static_cast(output_info.data_format()); + auto transform_op_def = OpenCLUtil::CreateTransformOpDef( + t_output_name, + internal_output_info.shape, + output_info.name(), + DataType::DT_FLOAT, + target_mem_type); + auto transform_op = CreateOperation( + op_registry, + &construct_context, + transform_op_def, + output_data_format); + operators_.emplace_back(std::move(transform_op)); + // where to do graph reference count. 
+ mem_optimizer->UpdateTensorRef(transform_op_def.get()); } } } +#endif // MACE_ENABLE_OPENCL + // Update output tensor reference + for (auto &output_info : net_def->output_info()) { + mem_optimizer->UpdateTensorRef(output_info.name()); + } + + // Do memory optimization + for (auto &op : operators_) { + VLOG(2) << "Operator " << op->debug_def().name() << "<" << op->device_type() + << ", " << op->debug_def().type() << ">"; + mem_optimizer->Optimize(op->operator_def().get(), output_mem_map); + } + VLOG(1) << mem_optimizer->DebugInfo(); } MaceStatus SerialNet::Init() { MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); OpInitContext init_context(ws_); - // TODO(liuqi): where to do memory reuse. - if (target_device_->device_type() == DeviceType::GPU) { - - } for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); @@ -98,18 +344,18 @@ MaceStatus SerialNet::Init() { } MaceStatus SerialNet::Run(RunMetadata *run_metadata) { - // TODO(liuqi): In/Out Buffer Transform MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); OpContext context(ws_, cpu_device_); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; DeviceType device_type = op->device_type(); - MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), - "<", device_type, ", ", op->debug_def().type(), ">", - ". mem_id: ", - MakeListString(op->debug_def().mem_id().data(), - op->debug_def().mem_id().size())); + MACE_LATENCY_LOGGER(1, "Running operator ", op->debug_def().name(), + "<", device_type, ", ", op->debug_def().type(), + ", ", + ProtoArgHelper::GetOptionalArg( + op->debug_def(), "T", static_cast(DT_FLOAT)), + ">"); if (device_type == target_device_->device_type()) { context.set_device(target_device_); } else { @@ -176,7 +422,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { float max_v = std::numeric_limits::lowest(); float min_v = std::numeric_limits::max(); Tensor::MappingGuard guard(op->Output(i)); - const float *output_data = op->Output(i)->data(); + auto *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { max_v = std::max(max_v, output_data[j]); min_v = std::min(min_v, output_data[j]); @@ -192,14 +438,14 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { std::vector bin_distribution(bin_size, 0); float bin_v = (max_v - min_v) / bin_size; Tensor::MappingGuard guard(op->Output(i)); - const float *output_data = op->Output(i)->data(); + auto *output_data = op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { - int ind = static_cast((output_data[j] - min_v) / bin_v); - if (ind < 0) - ind = 0; - else if (ind > bin_size-1) - ind = bin_size-1; - bin_distribution[ind]++; + int index = static_cast((output_data[j] - min_v) / bin_v); + if (index < 0) + index = 0; + else if (index > bin_size-1) + index = bin_size-1; + bin_distribution[index]++; } LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) << "@@" << min_v << "," << max_v<< "@@" diff --git a/mace/core/net.h b/mace/core/net.h index d5a6725f7265f86feabf7d6f4c82874c8394c7e0..10577a572f5a0629ae515d9b330befbaa639016e 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -27,6 +27,7 @@ namespace mace { class RunMetadata; class Workspace; +class MemoryOptimizer; class NetBase { public: @@ -47,12 +48,20 @@ class SerialNet : public NetBase { const NetDef *net_def, Workspace *ws, Device *target_device, - const NetMode mode = NetMode::NORMAL); + MemoryOptimizer * 
mem_optimizer); MaceStatus Init() override; MaceStatus Run(RunMetadata *run_metadata = nullptr) override; + private: + std::unique_ptr CreateOperation( + const OpRegistryBase *op_registry, + OpConstructContext *construct_context, + std::shared_ptr op_def, + DataFormat input_format, + bool is_quantize_model = false); + protected: Workspace *ws_; Device *target_device_; diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 9a1da4c81170f76f5b18325f3149e5836c5f75cd..6a437f884c506af231db882a500560bdd8dc67ec 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -23,16 +23,12 @@ namespace mace { OpConstructContext::OpConstructContext(Workspace *ws) : operator_def_(nullptr), ws_(ws), device_(nullptr) {} -OpConstructContext::OpConstructContext(OperatorDef *operator_def, - Workspace *ws, - Device *device) - : operator_def_(operator_def), ws_(ws), device_(device) {} OpInitContext::OpInitContext(Workspace *ws, Device *device) : ws_(ws), device_(device) {} Operation::Operation(OpConstructContext *context) - : operator_def_(std::make_shared(*(context->operator_def()))) + : operator_def_(context->operator_def()) {} MaceStatus Operation::Init(OpInitContext *context) { @@ -43,11 +39,9 @@ MaceStatus Operation::Init(OpInitContext *context) { ": Encountered a non-existing input tensor: ", input_str); inputs_.push_back(tensor); } - // TODO(liuqi): filter transform for (int i = 0; i < operator_def_->output_size(); ++i) { const std::string output_str = operator_def_->output(i); if (ws->HasTensor(output_str)) { - // TODO(liuqi): Workspace should pre-allocate all of the output tensors outputs_.push_back(ws->GetTensor(output_str)); } else { MACE_CHECK( @@ -66,15 +60,14 @@ MaceStatus Operation::Init(OpInitContext *context) { } outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( output_str, context->device()->allocator(), output_type))); - - if (i < operator_def_->output_shape_size()) { - std::vector - shape_configured(operator_def_->output_shape(i).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = operator_def_->output_shape(i).dims(dim); - } - ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); + } + if (i < operator_def_->output_shape_size()) { + std::vector + shape_configured(operator_def_->output_shape(i).dims_size()); + for (size_t dim = 0; dim < shape_configured.size(); ++dim) { + shape_configured[dim] = operator_def_->output_shape(i).dims(dim); } + ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); } } return MaceStatus::MACE_SUCCESS; @@ -164,33 +157,34 @@ const std::set OpRegistryBase::AvailableDevices( std::unique_ptr OpRegistryBase::CreateOperation( OpConstructContext *context, - DeviceType device_type, - const NetMode mode) const { - OperatorDef *operator_def = context->operator_def(); - const DataType dtype = static_cast( + DeviceType device_type) const { + auto operator_def = context->operator_def(); + DataType dtype = static_cast( ProtoArgHelper::GetOptionalArg( *operator_def, "T", static_cast(DT_FLOAT))); - const int op_mode_i = ProtoArgHelper::GetOptionalArg( - *operator_def, "mode", static_cast(NetMode::NORMAL)); - const NetMode op_mode = static_cast(op_mode_i); - VLOG(3) << "Creating operator " << operator_def->name() << "(" + if (device_type == DeviceType::CPU && dtype == DT_HALF) { + int arg_size = operator_def->arg_size(); + for (int i = 0; i < arg_size; ++i) { + if (operator_def->arg(i).name() == "T") { + operator_def->mutable_arg(i)->set_i(DT_FLOAT); + } + } + dtype = DT_FLOAT; + } + 
VLOG(1) << "Creating operator " << operator_def->name() << "(" << operator_def->type() << "<" << dtype << ">" << ") on " << device_type; - if (op_mode == mode) { - const std::string op_type = context->operator_def()->type(); - MACE_CHECK(registry_.count(op_type) != 0, - op_type, " operation is not registered."); - - std::string key = OpKeyBuilder(op_type) - .Device(device_type) - .TypeConstraint("T", dtype) - .Build(); - if (registry_.at(op_type)->creators.count(key) == 0) { - LOG(FATAL) << "Key not registered: " << key; - } - return registry_.at(op_type)->creators.at(key)(context); - } else { - return nullptr; + const std::string op_type = context->operator_def()->type(); + MACE_CHECK(registry_.count(op_type) != 0, + op_type, " operation is not registered."); + + std::string key = OpKeyBuilder(op_type) + .Device(device_type) + .TypeConstraint("T", dtype) + .Build(); + if (registry_.at(op_type)->creators.count(key) == 0) { + LOG(FATAL) << "Key not registered: " << key; } + return registry_.at(op_type)->creators.at(key)(context); } } // namespace mace diff --git a/mace/core/operator.h b/mace/core/operator.h index 7017240c8194e9bbe2cac7fe06c85b534683e7f2..8d3e1557bd5673ea07ddc4b3008711e43a8e27c2 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -33,14 +33,13 @@ namespace mace { class OpConstructContext { public: explicit OpConstructContext(Workspace *ws); - OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device); ~OpConstructContext() = default; - inline void set_operator_def(OperatorDef *operator_def) { + inline void set_operator_def(std::shared_ptr operator_def) { operator_def_ = operator_def; } - inline OperatorDef *operator_def() const { + inline std::shared_ptr operator_def() const { return operator_def_; } @@ -56,10 +55,19 @@ class OpConstructContext { return device_; } + inline void set_output_mem_type(MemoryType type) { + output_mem_type_ = type; + } + + inline MemoryType output_mem_type() const { + return output_mem_type_; + } + private: - OperatorDef *operator_def_; + std::shared_ptr operator_def_; Workspace *ws_; Device *device_; + MemoryType output_mem_type_; // used for transform memory }; // memory_optimizer, device @@ -137,6 +145,10 @@ class Operation { inline bool has_debug_def() const { return operator_def_ != nullptr; } + inline std::shared_ptr operator_def() { + return operator_def_; + } + protected: std::shared_ptr operator_def_; std::vector inputs_; @@ -190,8 +202,7 @@ class OpRegistryBase { std::unique_ptr CreateOperation( OpConstructContext *context, - DeviceType device_type, - const NetMode mode) const; + DeviceType device_type) const; template static std::unique_ptr DefaultCreator( diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 18840b7107619adf94aca7ae739caa3358d33fd3..ee4eae5961cc2d2368c8a1aa41ebac40ddc7187f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -285,7 +285,8 @@ OpenCLRuntime::OpenCLRuntime( is_profiling_enabled_(false), opencl_version_(CL_VER_UNKNOWN), gpu_type_(UNKNOWN), - mem_type_(MemoryType::GPU_IMAGE) { + mem_type_(MemoryType::GPU_IMAGE), + scratch_image_manager_(new ScratchImageManager) { std::vector all_platforms; cl::Platform::get(&all_platforms); if (all_platforms.size() == 0) { @@ -791,4 +792,8 @@ bool OpenCLRuntime::is_profiling_enabled() const { return is_profiling_enabled_; } +ScratchImageManager* OpenCLRuntime::scratch_image_manager() const { + return scratch_image_manager_.get(); +} + } 
// namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 3d182a9e0def86d6ecf73cd5140751b0d1702d31..ef1d50e1b9e1a47856f57bbdbb456c118c2c9dbf 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -25,6 +25,7 @@ #include "mace/core/file_storage.h" #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/scratch_image.h" #include "mace/proto/mace.pb.h" #include "mace/utils/string_util.h" #include "mace/utils/timer.h" @@ -82,6 +83,7 @@ class OpenCLRuntime { uint64_t device_global_mem_cache_size() const; uint32_t device_compute_units() const; Tuner *tuner(); + ScratchImageManager *scratch_image_manager() const; bool is_opencl_avaliable(); // TODO(liuqi): remove this function in the future, make decision at runtime. bool UseImageMemory(); @@ -134,6 +136,7 @@ class OpenCLRuntime { OpenCLVersion opencl_version_; GPUType gpu_type_; MemoryType mem_type_; + std::unique_ptr scratch_image_manager_; // All OpenCL object must be a pointer and manually deleted before unloading // OpenCL library. std::shared_ptr context_; diff --git a/mace/core/runtime/opencl/opencl_util.cc b/mace/core/runtime/opencl/opencl_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..02ffc8e02222492e9ec9f8d7a0688c9e3c49c5e7 --- /dev/null +++ b/mace/core/runtime/opencl/opencl_util.cc @@ -0,0 +1,181 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/runtime/opencl/opencl_util.h" + +#include + +#include "mace/utils/logging.h" + +namespace mace { + +namespace { +// [(C + 3) / 4 * W, N * H] +void CalInOutputImageShape(const std::vector &shape, /* NHWC */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2]; + (*image_shape)[1] = shape[0] * shape[1]; +} + +// [Ic, H * W * (Oc + 3) / 4] +void CalConv2dFilterImageShape(const std::vector &shape, /* OIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[1]; + (*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]); +} + +// [H * W * M, (Ic + 3) / 4] +void CalDepthwiseConv2dFilterImageShape( + const std::vector &shape, /* MIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[0] * shape[2] * shape[3]; + (*image_shape)[1] = RoundUpDiv4(shape[1]); +} + +// [(size + 3) / 4, 1] +void CalArgImageShape(const std::vector &shape, + std::vector *image_shape) { + MACE_CHECK(shape.size() == 1); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[0]); + (*image_shape)[1] = 1; +} + +// Only support 3x3 now +// [ (Ic + 3) / 4, 16 * Oc] +void CalWinogradFilterImageShape( + const std::vector &shape, /* Oc, Ic, H, W*/ + std::vector *image_shape, + const int blk_size) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[1]); + (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2)); +} + + +// [W * C, N * RoundUp<4>(H)] +void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[2] * shape[3]; + (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]); +} + +// [RoundUp<4>(W) * C, N * H] +void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3]; + (*image_shape)[1] = shape[0] * shape[1]; +} + +// [Ic * H * W, (Oc + 3) / 4] +void CalWeightHeightImageShape(const std::vector &shape, /* OIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = shape[1] * shape[2] * shape[3]; + (*image_shape)[1] = RoundUpDiv4(shape[0]); +} + +// [(Ic + 3) / 4 * H * W, Oc] +void CalWeightWidthImageShape(const std::vector &shape, /* OIHW */ + std::vector *image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape->resize(2); + (*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3]; + (*image_shape)[1] = shape[0]; +} +} // namespace + +void OpenCLUtil::CalImage2DShape(const std::vector &shape, /* NHWC */ + const OpenCLBufferType type, + std::vector *image_shape, + const int wino_block_size) { + MACE_CHECK_NOTNULL(image_shape); + switch (type) { + case CONV2D_FILTER: + CalConv2dFilterImageShape(shape, image_shape); + break; + case DW_CONV2D_FILTER: + CalDepthwiseConv2dFilterImageShape(shape, image_shape); + break; + case IN_OUT_CHANNEL: + CalInOutputImageShape(shape, image_shape); + break; + case ARGUMENT: + CalArgImageShape(shape, image_shape); + break; + case IN_OUT_HEIGHT: + CalInOutHeightImageShape(shape, image_shape); + break; + case IN_OUT_WIDTH: + CalInOutWidthImageShape(shape, image_shape); + break; + case WINOGRAD_FILTER: + CalWinogradFilterImageShape(shape, 
image_shape, wino_block_size); + break; + case WEIGHT_HEIGHT: + CalWeightHeightImageShape(shape, image_shape); + break; + case WEIGHT_WIDTH: + CalWeightWidthImageShape(shape, image_shape); + break; + default: + LOG(FATAL) << "Mace not supported yet."; + } +} + +std::shared_ptr OpenCLUtil::CreateTransformOpDef( + const std::string &input_name, + const std::vector &input_shape, + const std::string &output_name, + const mace::DataType dt, + const mace::MemoryType mem_type) { + std::unique_ptr op(new OperatorDef); + std::string op_name = "mace_node_" + output_name; + op->set_name(op_name); + op->set_type("BufferTransform"); + op->add_input(input_name); + op->add_output(output_name); + Argument *arg = op->add_arg(); + arg->set_name("buffer_type"); + arg->set_i(static_cast(OpenCLBufferType::IN_OUT_CHANNEL)); + arg = op->add_arg(); + arg->set_name("mem_type"); + arg->set_i(static_cast(mem_type)); + arg = op->add_arg(); + arg->set_name("T"); + arg->set_i(static_cast(dt)); + arg = op->add_arg(); + arg->set_name("device"); + arg->set_i(DeviceType::GPU); + if (!input_shape.empty()) { + OutputShape *shape = op->add_output_shape(); + for (auto value : input_shape) { + shape->add_dims(value); + } + } + return std::move(op); +} +} // namespace mace diff --git a/mace/core/runtime/opencl/opencl_util.h b/mace/core/runtime/opencl/opencl_util.h new file mode 100644 index 0000000000000000000000000000000000000000..eb518317455dccebb6e05a7456765fbd0700f566 --- /dev/null +++ b/mace/core/runtime/opencl/opencl_util.h @@ -0,0 +1,54 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_ +#define MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_ + +#include +#include +#include + +#include "mace/core/types.h" + +namespace mace { +enum OpenCLBufferType { + CONV2D_FILTER = 0, + IN_OUT_CHANNEL = 1, + ARGUMENT = 2, + IN_OUT_HEIGHT = 3, + IN_OUT_WIDTH = 4, + WINOGRAD_FILTER = 5, + DW_CONV2D_FILTER = 6, + WEIGHT_HEIGHT = 7, + WEIGHT_WIDTH = 8, +}; + + +class OpenCLUtil { + public: + static void CalImage2DShape(const std::vector &shape, /* NHWC */ + const OpenCLBufferType type, + std::vector *image_shape, + const int wino_blk_size = 2); + + static std::shared_ptr CreateTransformOpDef( + const std::string &input_name, + const std::vector &input_shape, + const std::string &output_name, + const mace::DataType dt, + const MemoryType mem_type); +}; + +} // namespace mace +#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_UTIL_H_ diff --git a/mace/core/runtime/opencl/scratch_image.cc b/mace/core/runtime/opencl/scratch_image.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2d4dcfebca536e2ef99e37ac90cdd6194053108 --- /dev/null +++ b/mace/core/runtime/opencl/scratch_image.cc @@ -0,0 +1,84 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/opencl/scratch_image.h" + +#include +#include + +namespace mace { + +ScratchImageManager::ScratchImageManager() = default; +ScratchImageManager::~ScratchImageManager() = default; + +Image *ScratchImageManager::Spawn( + Allocator *allocator, + const std::vector &shape, + const DataType dt, + int *id) { + // TODO(liuqi): not optimal memory reuse strategy + int found_image_idx = -1; + int image_count = static_cast(reference_count_.size()); + for (int i = 0; i < image_count; ++i) { + int count = reference_count_[i]; + if (count == 0 && images_.at(count)->dtype() == dt) { + auto image_shape = images_.at(count)->image_shape(); + if (image_shape[0] >= shape[0] && image_shape[1] >= shape[1]) { + found_image_idx = i; + break; + } + } + } + // if not found + if (found_image_idx == -1) { + reference_count_.push_back(0); + images_[image_count] = + std::move(std::unique_ptr(new Image(allocator))); + if (images_.at(image_count)->Allocate(shape, dt) != + MaceStatus::MACE_SUCCESS) { + return nullptr; + } + found_image_idx = image_count; + VLOG(2) << "Spawn image " << found_image_idx << ": " << MakeString(shape) + << "<" << dt << ">"; + } + reference_count_[found_image_idx] += 1; + *id = found_image_idx; + return images_.at(found_image_idx).get(); +} + +void ScratchImageManager::Deactive(int id) { + MACE_CHECK(reference_count_.size() > static_cast(id) + && reference_count_[id] > 0, + "Image id ", id, " exceed the vector size ", + reference_count_.size()); + reference_count_[id] -= 1; +} + +ScratchImage::ScratchImage(mace::ScratchImageManager *manager) + : manager_(manager), id_(-1) {} + +ScratchImage::~ScratchImage() { + if (id_ >= 0) { + manager_->Deactive(id_); + } +} + +Image* ScratchImage::Scratch(Allocator *allocator, + const std::vector &shape, + const mace::DataType dt) { + return manager_->Spawn(allocator, shape, dt, &id_); +} + +} // namespace mace diff --git a/mace/core/runtime/opencl/scratch_image.h b/mace/core/runtime/opencl/scratch_image.h new file mode 100644 index 0000000000000000000000000000000000000000..adfe208f8a376878fa1319a4fd935ae4ec8a6102 --- /dev/null +++ b/mace/core/runtime/opencl/scratch_image.h @@ -0,0 +1,58 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_ +#define MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_ + +#include +#include +#include + +#include "mace/core/buffer.h" + +namespace mace { + +class ScratchImageManager { + public: + ScratchImageManager(); + ~ScratchImageManager(); + + Image *Spawn(Allocator *allocator, + const std::vector &shape, + const DataType dt, + int *id); + + void Deactive(int id); + + private: + std::unordered_map> images_; + std::vector reference_count_; +}; + +class ScratchImage { + public: + explicit ScratchImage(ScratchImageManager *); + ~ScratchImage(); + + Image *Scratch(Allocator *allocator, + const std::vector &shape, + const DataType dt); + + private: + ScratchImageManager *manager_; + int id_; +}; + +} // namespace mace +#endif // MACE_CORE_RUNTIME_OPENCL_SCRATCH_IMAGE_H_ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index f217bee42f7b6615453704c375e79d08cb1c4666..22d5f77270fc030c6915805c850ef2bb379ee489 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -97,7 +97,7 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { } } // namespace numerical_chars -enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4, OHWI = 5 }; +enum FilterDataFormat { HWOI = 100, OIHW = 101, HWIO = 102, OHWI = 103 }; class Tensor { public: @@ -223,7 +223,7 @@ class Tensor { } inline MemoryType memory_type() const { - MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty" ); + MACE_CHECK(buffer_ != nullptr, "Tensor ", name_, " is empty"); if (buffer_->OnHost()) { return MemoryType::CPU_BUFFER; } else if (typeid(*buffer_) == typeid(Image)) { @@ -233,6 +233,14 @@ class Tensor { } } + inline void set_data_format(DataFormat data_format) { + data_format_ = data_format; + } + + inline DataFormat data_format() const { + return data_format_; + } + #ifdef MACE_ENABLE_OPENCL inline cl::Image *opencl_image() const { MACE_CHECK(has_opencl_image(), name_, " do not have image"); @@ -499,6 +507,7 @@ class Tensor { int32_t zero_point_; float minval_; float maxval_; + DataFormat data_format_; // used for 4D input/output tensor MACE_DISABLE_COPY_AND_ASSIGN(Tensor); }; diff --git a/mace/core/transformer.h b/mace/core/transformer.h deleted file mode 100644 index 09f56009e0114dd5de9f017a3dbeb66dbff2eea3..0000000000000000000000000000000000000000 --- a/mace/core/transformer.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_CORE_TRANSFORMER_H_ -#define MACE_CORE_TRANSFORMER_H_ - -#include "mace/proto/mace.pb.h" - -namespace mace { - -class TransformerBase { - public: - // Construct transform operation. 
- virtual std::vector> ConstructTranformOp( - OperatorDef *op_def, - bool transform_filter = true) = 0; -}; - -} // namespace mace - -#endif // MACE_CORE_TRANSFORMER_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 766e125e20830cfc61891aaaf9f57dfd6eef8244..e98387eb31b9ee9f58923463ada94d7151753734 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -18,6 +18,7 @@ #include #include "mace/core/arg_helper.h" +#include "mace/core/memory_optimizer.h" #include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL @@ -27,13 +28,6 @@ namespace mace { namespace { -bool ShouldPreallocateMemoryForOp(const OperatorDef &op) { - static const std::unordered_set reuse_buffer_ops { - "Reshape", "Identity", "Squeeze" - }; - return reuse_buffer_ops.find(op.type()) == reuse_buffer_ops.end(); -} - bool HasQuantizeOp(const NetDef &net_def) { for (auto &op : net_def.op()) { if (op.type() == "Quantize") { @@ -48,13 +42,14 @@ Workspace::Workspace() = default; Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, - DataType type) { + DataType type, + bool is_weight) { if (HasTensor(name)) { VLOG(3) << "Tensor " << name << " already exists. Skipping."; } else { VLOG(3) << "Creating Tensor " << name; tensor_map_[name] = std::unique_ptr(new Tensor(alloc, type, - false, name)); + is_weight, name)); } return GetTensor(name); } @@ -199,13 +194,79 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, fused_buffer_ = true; } } + return MaceStatus::MACE_SUCCESS; +} - if (device_type == DeviceType::CPU || device_type == DeviceType::GPU) { - MaceStatus status = CreateOutputTensorBuffer(net_def, device); - if (status != MaceStatus::MACE_SUCCESS) return status; +MaceStatus Workspace::PreallocateOutputTensor( + const mace::NetDef &net_def, + const mace::MemoryOptimizer *mem_optimizer, + Device *device) { + auto &mem_blocks = mem_optimizer->mem_blocks(); + for (auto &mem_block : mem_blocks) { + VLOG(3) << "Preallocate memory block. 
id: " << mem_block.mem_id() + << ", memory type: " << mem_block.mem_type() + << ", size: " << mem_block.x() << "x" << mem_block.y(); + if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(GetCPUAllocator())); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { + std::unique_ptr image_buf( + new Image(device->allocator())); + MACE_RETURN_IF_ERROR(image_buf->Allocate( + {static_cast(mem_block.x()), + static_cast(mem_block.y())}, mem_block.data_type())); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(image_buf)); + } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { + std::unique_ptr tensor_buf( + new Buffer(device->allocator())); + MACE_RETURN_IF_ERROR(tensor_buf->Allocate( + mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), + std::move(tensor_buf)); + } + } + VLOG(1) << "Preallocate buffer to tensors"; + bool is_quantize_model = IsQuantizedModel(net_def); + for (auto &tensor_mem : mem_optimizer->tensor_mem_map()) { + std::unique_ptr tensor + (new Tensor(preallocated_allocator_.GetBuffer(tensor_mem.second.first), + tensor_mem.second.second, + false, tensor_mem.first)); + if (mem_blocks[tensor_mem.second.first].mem_type() + == MemoryType::GPU_IMAGE) { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.first + << " Data type: " << tensor->dtype() + << " Image shape: " + << dynamic_cast(tensor->UnderlyingBuffer()) + ->image_shape()[0] + << ", " + << dynamic_cast(tensor->UnderlyingBuffer()) + ->image_shape()[1]; + tensor->set_data_format(DataFormat::NHWC); + } else { + VLOG(1) << "Tensor: " << tensor_mem.first + << " Mem: " << tensor_mem.second.first + << " Data type: " << tensor->dtype() + << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); + if (mem_blocks[tensor_mem.second.first].mem_type() + == MemoryType::GPU_BUFFER || + is_quantize_model) { + tensor->set_data_format(DataFormat::NHWC); + } else { + tensor->set_data_format(DataFormat::NCHW); + } + } + tensor_map_[tensor_mem.first] = std::move(tensor); } - if (device_type == DeviceType::CPU) { + // add quantize info for output tensors. + if (device->device_type() == DeviceType::CPU) { for (const auto &op : net_def.op()) { VLOG(2) << "Add quantize info for op: " << op.name(); MACE_CHECK(op.quantize_info().empty() @@ -225,139 +286,6 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, return MaceStatus::MACE_SUCCESS; } -MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def, - Device *device) { - DeviceType device_type = device->device_type(); - DataType dtype = DataType::DT_INVALID; - if (net_def.mem_arena().mem_block_size() > 0) { - // We use the data type of the first op with mem id, - // as CPU&GPU have consistent data type for each layer for now. - // As DSP may have different data output type for each op, - // we stick to the same concept. 
- for (auto &op : net_def.op()) { - // TODO(liuqi): refactor to add device_type to OperatorDef - const int op_device = - ProtoArgHelper::GetOptionalArg( - op, "device", static_cast(device_type)); - if (op_device == device_type && !op.mem_id().empty()) { - const DataType op_dtype = static_cast( - ProtoArgHelper::GetOptionalArg( - op, "T", static_cast(DT_FLOAT))); - if (op_dtype != DataType::DT_INVALID) { - dtype = op_dtype; - // find first valid data type, break - break; - } - } - } - MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); - } - // TODO(liyin): memory block should not have concept of type, but to be - // consistent with gpu, all memory block use float/half as unit - for (auto &mem_block : net_def.mem_arena().mem_block()) { - if (mem_block.device_type() == device_type) { - VLOG(3) << "Preallocate memory block. id: " << mem_block.mem_id() - << ", device type: " << mem_block.device_type() - << ", memory type: " << mem_block.mem_type(); - if (mem_block.mem_type() == MemoryType::CPU_BUFFER) { - std::unique_ptr tensor_buf( - new Buffer(GetCPUAllocator())); - MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() + MACE_EXTRA_BUFFER_PAD_SIZE)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(tensor_buf)); - } else if (mem_block.mem_type() == MemoryType::GPU_IMAGE) { - std::unique_ptr image_buf( - new Image(device->allocator())); - MACE_RETURN_IF_ERROR(image_buf->Allocate( - {mem_block.x(), mem_block.y()}, dtype)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(image_buf)); - } else if (mem_block.mem_type() == MemoryType::GPU_BUFFER) { - std::unique_ptr tensor_buf( - new Buffer(device->allocator())); - MACE_RETURN_IF_ERROR(tensor_buf->Allocate( - mem_block.x() * GetEnumTypeSize(dtype) - + MACE_EXTRA_BUFFER_PAD_SIZE)); - preallocated_allocator_.SetBuffer(mem_block.mem_id(), - std::move(tensor_buf)); - } - } - } - VLOG(3) << "Preallocate buffer to tensors"; - for (auto &op : net_def.op()) { - // TODO(liuqi): refactor to add device_type to OperatorDef - const int op_device = - ProtoArgHelper::GetOptionalArg( - op, "device", static_cast(device_type)); - if (op_device == device_type) { - if (!op.mem_id().empty() - && ShouldPreallocateMemoryForOp(op)) { - auto mem_ids = op.mem_id(); - int count = mem_ids.size(); - for (int i = 0; i < count; ++i) { - DataType output_type; - if (i < op.output_type_size()) { - output_type = op.output_type(i); - } else { - output_type = dtype; - } - std::unique_ptr tensor - (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), - output_type, false, op.output(i))); - if (device_type == DeviceType::GPU && tensor->has_opencl_image()) { - VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" - << " Mem: " << mem_ids[i] - << " Image shape: " - << dynamic_cast(tensor->UnderlyingBuffer()) - ->image_shape()[0] - << ", " - << dynamic_cast(tensor->UnderlyingBuffer()) - ->image_shape()[1]; - } else { - VLOG(3) << "Tensor: " << op.output(i) << "(" << op.type() << ")" - << " Mem: " << mem_ids[i] - << ", Buffer size: " << tensor->UnderlyingBuffer()->size(); - } - tensor_map_[op.output(i)] = std::move(tensor); - } - } else { - for (int i = 0; i < op.output().size(); ++i) { - MACE_CHECK( - op.output_type_size() == 0 - || op.output_size() - == op.output_type_size(), - "operator output size != operator output type size", - op.output_size(), - op.output_type_size()); - DataType output_type; - if (i < op.output_type_size()) { - output_type = op.output_type(i); - } else { - output_type = 
static_cast(ProtoArgHelper::GetOptionalArg( - op, "T", static_cast(DT_FLOAT))); - } - CreateTensor(op.output(i), - device->allocator(), - output_type); - } - } - - for (int output_idx = 0; output_idx < op.output_shape_size(); - ++output_idx) { - std::vector - shape_configured(op.output_shape(output_idx).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = op.output_shape(output_idx).dims(dim); - } - tensor_map_[op.output(output_idx)]->SetShapeConfigured( - shape_configured); - } - } - } - return MaceStatus::MACE_SUCCESS; -} - void Workspace::RemoveUnusedBuffer() { auto iter = tensor_map_.begin(); auto end_iter = tensor_map_.end(); @@ -398,4 +326,11 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def, tensor_buffer_.reset(nullptr); } +void Workspace::RemoveTensor(const std::string &name) { + auto iter = tensor_map_.find(name); + if (iter != tensor_map_.end()) { + tensor_map_.erase(iter); + } +} + } // namespace mace diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 2a8089370c01c4341d6cd94a775ee6eaf1443910..e1d0a9829429cec5bd20c6b9d94aa73a574167a3 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -27,6 +27,8 @@ namespace mace { +class MemoryOptimizer; + class Workspace { public: typedef std::map> TensorMap; @@ -36,7 +38,8 @@ class Workspace { Tensor *CreateTensor(const std::string &name, Allocator *alloc, - DataType type); + DataType type, + bool is_weight = false); inline bool HasTensor(const std::string &name) const { return tensor_map_.find(name) != tensor_map_.end(); @@ -52,12 +55,19 @@ class Workspace { Device *device, const unsigned char *model_data); + MaceStatus PreallocateOutputTensor(const NetDef &net_def, + const MemoryOptimizer *mem_optimizer, + Device *device); + void RemoveUnusedBuffer(); void RemoveAndReloadBuffer(const NetDef &net_def, const unsigned char *model_data, Allocator *alloc); + void RemoveTensor(const std::string &name); + + private: MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, Device *device); diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index c4d65f7bd5496ef5e3e8afbe816c38dbe5cd12ef..bd94886bf728507ffc0f3d22b910f5c0b5bf5198 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -20,9 +20,11 @@ #include -#include "mace/core/net.h" #include "mace/core/device_context.h" +#include "mace/core/memory_optimizer.h" +#include "mace/core/net.h" #include "mace/ops/ops_registry.h" +#include "mace/ops/transpose.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -69,6 +71,7 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { // Check OpenCL avaliable auto runtime = device->opencl_runtime(); if (!runtime->is_opencl_avaliable()) { + LOG(WARNING) << "The device does not support OpenCL"; return MaceStatus::MACE_OUT_OF_RESOURCES; } @@ -84,28 +87,6 @@ MaceStatus CheckGPUAvalibility(const NetDef *net_def, Device *device) { const MemoryType mem_type = static_cast(mem_type_i); runtime->set_mem_type(mem_type); - if (mem_type == MemoryType::GPU_IMAGE) { - if (!runtime->IsImageSupport()) { - return MaceStatus::MACE_OUT_OF_RESOURCES; - } - - auto opencl_max_image_size = runtime->GetMaxImage2DSize(); - if (opencl_max_image_size.empty()) { - return MaceStatus::MACE_OUT_OF_RESOURCES; - } - - const std::vector net_max_image_size = - ProtoArgHelper::GetRepeatedArgs( - *net_def, "opencl_max_image_size", {0, 0}); - - if (static_cast(net_max_image_size[0]) > opencl_max_image_size[0] - || static_cast(net_max_image_size[1]) - > 
opencl_max_image_size[1]) { - LOG(INFO) << "opencl max image size " << MakeString(opencl_max_image_size) - << " vs " << MakeString(net_max_image_size); - return MaceStatus::MACE_OUT_OF_RESOURCES; - } - } return MaceStatus::MACE_SUCCESS; } @@ -288,14 +269,17 @@ class MaceTensor::Impl { public: std::vector shape; std::shared_ptr data; + DataFormat format; }; MaceTensor::MaceTensor(const std::vector &shape, - std::shared_ptr data) { + std::shared_ptr data, + const DataFormat format) { MACE_CHECK_NOTNULL(data.get()); impl_ = std::unique_ptr(new MaceTensor::Impl()); impl_->shape = shape; impl_->data = data; + impl_->format = format; } MaceTensor::MaceTensor() { @@ -306,23 +290,27 @@ MaceTensor::MaceTensor(const MaceTensor &other) { impl_ = std::unique_ptr(new MaceTensor::Impl()); impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); } MaceTensor::MaceTensor(const MaceTensor &&other) { impl_ = std::unique_ptr(new MaceTensor::Impl()); impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); } MaceTensor &MaceTensor::operator=(const MaceTensor &other) { impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); return *this; } MaceTensor &MaceTensor::operator=(const MaceTensor &&other) { impl_->shape = other.shape(); impl_->data = other.data(); + impl_->format = other.data_format(); return *this; } @@ -334,6 +322,10 @@ const std::shared_ptr MaceTensor::data() const { return impl_->data; } std::shared_ptr MaceTensor::data() { return impl_->data; } +DataFormat MaceTensor::data_format() const { + return impl_->format; +} + // Mace Engine class MaceEngine::Impl { public: @@ -355,6 +347,14 @@ class MaceEngine::Impl { std::map *outputs, RunMetadata *run_metadata); + private: + MaceStatus TransposeInput( + const std::pair &input, + Tensor *input_tensor); + + MaceStatus TransposeOutput(const Tensor *output_tensor, + std::pair *output); + private: const unsigned char *model_data_; size_t model_data_size_; @@ -363,11 +363,12 @@ class MaceEngine::Impl { std::unique_ptr device_; std::unique_ptr ws_; std::unique_ptr net_; - std::map input_info_map_; - std::map output_info_map_; + bool is_quantized_model_; #ifdef MACE_ENABLE_HEXAGON std::unique_ptr hexagon_controller_; #endif + std::map input_info_map_; + std::map output_info_map_; MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; @@ -379,7 +380,8 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) device_type_(config.impl_->device_type()), device_(nullptr), ws_(new Workspace()), - net_(nullptr) + net_(nullptr), + is_quantized_model_(false) #ifdef MACE_ENABLE_HEXAGON , hexagon_controller_(nullptr) #endif @@ -417,6 +419,8 @@ MaceStatus MaceEngine::Impl::Init( MACE_RETURN_IF_ERROR(CheckGPUAvalibility(net_def, device_.get())); } #endif + // mark quantized model flag + is_quantized_model_ = IsQuantizedModel(*net_def); // Get input and output information. 
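// [Editor's sketch, not part of the patch] With the DataFormat field added to
// MaceTensor earlier in this file, callers can state how their host buffers
// are laid out instead of relying on the previous implicit NHWC assumption;
// Run() then converts via TransposeInput/TransposeOutput below. The shape,
// buffer, and names used here are hypothetical.
#include <memory>
#include <vector>
#include "mace/public/mace.h"

static mace::MaceTensor MakeNchwInputExample() {
  std::vector<int64_t> shape = {1, 3, 224, 224};   // data laid out as NCHW
  int64_t size = 1 * 3 * 224 * 224;
  auto buffer = std::shared_ptr<float>(new float[size],
                                       std::default_delete<float[]>());
  // Declaring the layout explicitly lets the engine decide whether a
  // transpose to the runtime's preferred format is required.
  return mace::MaceTensor(shape, buffer, mace::DataFormat::NCHW);
}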
for (auto &input_info : net_def->input_info()) { input_info_map_[input_info.name()] = input_info; @@ -431,8 +435,7 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's inputs: " << MakeString(MapKeys(input_info_map_)); } - ws_->CreateTensor(MakeString("mace_input_node_", input_name), - device_->allocator(), DT_FLOAT); + ws_->CreateTensor(input_name, device_->allocator(), DT_FLOAT); } for (auto output_name : output_nodes) { if (output_info_map_.find(output_name) == output_info_map_.end()) { @@ -440,8 +443,6 @@ MaceStatus MaceEngine::Impl::Init( << "' does not belong to model's outputs " << MakeString(MapKeys(output_info_map_)); } - ws_->CreateTensor(MakeString("mace_output_node_", output_name), - device_->allocator(), DT_FLOAT); } #ifdef MACE_ENABLE_HEXAGON if (device_type_ == HEXAGON) { @@ -461,19 +462,19 @@ MaceStatus MaceEngine::Impl::Init( device_.get(), model_data)); + MemoryOptimizer mem_optimizer; // Init model - auto net = std::unique_ptr(new SerialNet( - op_registry_.get(), - net_def, - ws_.get(), - device_.get(), - NetMode::INIT)); - MACE_RETURN_IF_ERROR(net->Init()); - MACE_RETURN_IF_ERROR(net->Run()); net_ = std::unique_ptr(new SerialNet(op_registry_.get(), net_def, ws_.get(), - device_.get())); + device_.get(), + &mem_optimizer)); + + // Preallocate all output tensors of ops + MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def, + &mem_optimizer, + device_.get())); + MACE_RETURN_IF_ERROR(net_->Init()); #ifdef MACE_ENABLE_HEXAGON } @@ -524,6 +525,117 @@ MaceEngine::Impl::~Impl() { #endif } +MaceStatus MaceEngine::Impl::TransposeInput( + const std::pair &input, + Tensor *input_tensor) { + if (device_->device_type() == DeviceType::CPU && + input.second.shape().size() == 4 && + input.second.data_format() == NHWC && + !is_quantized_model_) { + VLOG(1) << "Transform input " << input.first << " from NHWC to NCHW"; + input_tensor->set_data_format(DataFormat::NCHW); + std::vector dst_dims = {0, 3, 1, 2}; + std::vector output_shape = + TransposeShape(input.second.shape(), dst_dims); + MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); + } else if ( + (is_quantized_model_ || device_->device_type() == DeviceType::GPU) && + input.second.shape().size() == 4 && + input.second.data_format() == DataFormat::NCHW) { + VLOG(1) << "Transform input " << input.first << " from NCHW to NHWC"; + std::vector dst_dims = {0, 2, 3, 1}; + input_tensor->set_data_format(DataFormat::NHWC); + std::vector output_shape = + TransposeShape(input.second.shape(), dst_dims); + MACE_RETURN_IF_ERROR(input_tensor->Resize(output_shape)); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + return ops::Transpose(input.second.data().get(), + input.second.shape(), + dst_dims, + input_data); + } else { + input_tensor->set_data_format(input.second.data_format()); + MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); + Tensor::MappingGuard input_guard(input_tensor); + float *input_data = input_tensor->mutable_data(); + memcpy(input_data, input.second.data().get(), + input_tensor->size() * sizeof(float)); + return MaceStatus::MACE_SUCCESS; + } +} + +MaceStatus MaceEngine::Impl::TransposeOutput( + const mace::Tensor *output_tensor, + std::pair *output) { + // save output + if (output_tensor != nullptr && output->second.data() != 
nullptr) { + if (device_->device_type() == DeviceType::CPU && + output->second.shape().size() == 4 && + output->second.data_format() != output_tensor->data_format()) { + MACE_CHECK(output_tensor->data_format() == NCHW); + VLOG(1) << "Transform output " << output->first << " from NCHW to NHWC"; + std::vector dst_dims = {0, 2, 3, 1}; + std::vector shape = + TransposeShape(output_tensor->shape(), + dst_dims); + MACE_CHECK(shape == output->second.shape()) + << "Output shape mismatch: " + << MakeString(shape) << " != " + << MakeString(output->second.shape()); + Tensor::MappingGuard output_guard(output_tensor); + const float *output_data = output_tensor->data(); + return ops::Transpose(output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); + } else if (device_->device_type() == DeviceType::GPU && + output->second.shape().size() == 4 && + output->second.data_format() != output_tensor->data_format()) { + VLOG(1) << "Transform output " << output->first << " from " + << output_tensor->data_format() << " to " + << output->second.data_format(); + std::vector dst_dims = {0, 3, 1, 2}; + if (output_tensor->data_format() == NCHW) { + dst_dims = {0, 2, 3, 1}; + } + std::vector shape = + TransposeShape(output_tensor->shape(), + dst_dims); + MACE_CHECK(shape == output->second.shape()) + << "Output shape mismatch: " + << MakeString(shape) << " != " + << MakeString(output->second.shape()); + Tensor::MappingGuard output_guard(output_tensor); + const float *output_data = output_tensor->data(); + return ops::Transpose(output_data, + output_tensor->shape(), + dst_dims, + output->second.data().get()); + } else { + Tensor::MappingGuard output_guard(output_tensor); + auto shape = output_tensor->shape(); + int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + MACE_CHECK(shape == output->second.shape()) + << "Output shape mismatch: " + << MakeString(shape) << " != " + << MakeString(output->second.shape()); + std::memcpy(output->second.data().get(), output_tensor->data(), + output_size * sizeof(float)); + return MaceStatus::MACE_SUCCESS; + } + } else { + return MaceStatus::MACE_INVALID_ARGS; + } +} + MaceStatus MaceEngine::Impl::Run( const std::map &inputs, std::map *outputs, @@ -537,15 +649,8 @@ MaceStatus MaceEngine::Impl::Run( << "' does not belong to model's inputs: " << MakeString(MapKeys(input_info_map_)); } - Tensor *input_tensor = - ws_->GetTensor(MakeString("mace_input_node_", input.first)); - MACE_RETURN_IF_ERROR(input_tensor->Resize(input.second.shape())); - { - Tensor::MappingGuard input_guard(input_tensor); - float *input_data = input_tensor->mutable_data(); - memcpy(input_data, input.second.data().get(), - input_tensor->size() * sizeof(float)); - } + Tensor *input_tensor = ws_->GetTensor(input.first); + MACE_RETURN_IF_ERROR(TransposeInput(input, input_tensor)); input_tensors.push_back(input_tensor); } for (auto &output : *outputs) { @@ -554,8 +659,7 @@ MaceStatus MaceEngine::Impl::Run( << "' does not belong to model's outputs: " << MakeString(MapKeys(output_info_map_)); } - Tensor *output_tensor = - ws_->GetTensor(MakeString("mace_output_node_", output.first)); + Tensor *output_tensor = ws_->GetTensor(output.first); output_tensors.push_back(output_tensor); } #ifdef MACE_ENABLE_HEXAGON @@ -577,23 +681,9 @@ MaceStatus MaceEngine::Impl::Run( } #endif for (auto &output : *outputs) { - Tensor *output_tensor = - ws_->GetTensor(MakeString("mace_output_node_", output.first)); + Tensor *output_tensor = ws_->GetTensor(output.first); // save output - 
if (output_tensor != nullptr && output.second.data() != nullptr) { - Tensor::MappingGuard output_guard(output_tensor); - auto shape = output_tensor->shape(); - int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1, - std::multiplies()); - MACE_CHECK(shape == output.second.shape()) - << "Output shape mismatch: " - << MakeString(output.second.shape()) - << " != " << MakeString(shape); - std::memcpy(output.second.data().get(), output_tensor->data(), - output_size * sizeof(float)); - } else { - return MaceStatus::MACE_INVALID_ARGS; - } + MACE_RETURN_IF_ERROR(TransposeOutput(output_tensor, &output)); } return MaceStatus::MACE_SUCCESS; } diff --git a/mace/libmace/mace_version_script.lds b/mace/libmace/mace_version_script.lds index 04e88455f67c209c0e6c7d70cce12167a81fbad5..9b7d34538ad20417e59051420048e98998c5afd7 100644 --- a/mace/libmace/mace_version_script.lds +++ b/mace/libmace/mace_version_script.lds @@ -14,7 +14,6 @@ mace { *mace*NetDef*; *mace*MemoryType*; *mace*DataType*; - *mace*MemoryArena*; *mace*InputInfo*; *mace*OutputInfo*; *mace*OutputShape*; diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 5fb0683cf1be1d8936ec411877f4f3492ac1f960..99a50f7fe1a3cf92149623eb3395271cca580ad7 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -30,10 +30,8 @@ cc_library( "arm/*_test.cc", "ops_registry.cc", "ops_test_util.cc", - "buffer_inverse_transform.cc", - "buffer_transformer.cc", + "buffer_transform.cc", "lstm_cell.cc", - "winograd_transform.cc", "quantize.cc", ], ) + if_opencl_enabled(glob( @@ -41,10 +39,8 @@ cc_library( "opencl/*.cc", "opencl/image/*.cc", "opencl/buffer/*.cc", - "buffer_inverse_transform.cc", - "buffer_transformer.cc", + "buffer_transform.cc", "lstm_cell.cc", - "winograd_transform.cc", ], exclude = [ "opencl/*_test.cc", diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc index 19b3289fcb70b16344edd4dcd8f80552ba6f389a..b904b5c275373e48f59358b8a238f61dd6917bf6 100644 --- a/mace/ops/activation.cc +++ b/mace/ops/activation.cc @@ -90,7 +90,7 @@ class ActivationOp : public Operation { } if (type == ActivationType::PRELU) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index bd766047169bdf67c9cba4dfdb186c3883655e01..76447e9b6134229a002ac94bb09f58b2f857d038 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -30,31 +30,19 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Activation", "ReluBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } + OpDefBuilder("Activation", "ReluBM") + 
.Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -100,29 +88,18 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluxBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6.0) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Activation", "ReluxBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6.0) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Activation", "ReluxBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6.0) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -168,36 +145,21 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; } - net.AddRandomInput("Alpha", {channels}); + net.AddRandomInput("Alpha", {channels}, true); - if (D == DeviceType::CPU) { - OpDefBuilder("Activation", "PreluBM") - .Input("Input") - .Input("Alpha") - .Output("Output") - .AddStringArg("activation", "PRELU") - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Alpha", "AlphaImage", - ops::BufferType::ARGUMENT); - - OpDefBuilder("Activation", "PreluBM") - .Input("InputImage") - .Input("AlphaImage") - .Output("Output") - .AddStringArg("activation", "PRELU") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } + OpDefBuilder("Activation", "PreluBM") + .Input("Input") + .Input("Alpha") + .Output("Output") + .AddStringArg("activation", "PRELU") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -243,27 +205,17 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) { // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "TanhBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); - } else { - 
OpDefBuilder("Activation", "TanhBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Activation", "TanhBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "TANH") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { @@ -310,27 +262,17 @@ void SigmoidBenchmark( // Add input data if (D == DeviceType::CPU) { - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, channels, height, width}); } else { - net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "SigmoidBM") - .Input("InputImage") - .Output("Output") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Activation", "SigmoidBM") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Activation", "SigmoidBM") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index f127be425f9be4478f0fdf7fbadcaeb2ff6bc0a8..4cd63ab6b070d36d45c6547ee7bbe4ea9c2ebf0e 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -30,32 +30,14 @@ void TestSimpleRelu() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); + OpDefBuilder("Activation", "ReluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); @@ -78,32 +60,14 @@ void TestUnalignedSimpleRelu() { // Add input data net.AddInputFromArray("Input", {1, 3, 2, 1}, {-7, 7, -6, 6, -5, 5}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELU") - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "ReluTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELU") + .Finalize(net.NewOperatorDef()); 
- // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor({1, 3, 2, 1}, {0, 7, 0, 6, 0, 5}); @@ -129,34 +93,15 @@ void TestSimpleRelux() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluxTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "ReluxTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluxTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); @@ -179,34 +124,15 @@ void TestSimpleReluRelux() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "ReluxTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "ReluxTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "RELUX") - .AddFloatArg("max_limit", 6) - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "ReluxTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "RELUX") + .AddFloatArg("max_limit", 6) + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, {0, 6, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0}); @@ -232,43 +158,36 @@ void TestSimplePrelu() { net.AddInputFromArray( "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0}); - net.AddInputFromArray("Alpha", {2}, {2.0, 3.0}); + net.AddInputFromArray("Alpha", {2}, {2.0, 3.0}, true); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Activation", "PreluTest") - .Input("InputImage") + .Input("Input") .Input("Alpha") - .Output("OutputImage") + .Output("Output") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { + net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); OpDefBuilder("Activation", "PreluTest") - .Input("Input") + .Input("InputNCHW") .Input("Alpha") - .Output("Output") + .Output("OutputNCHW") .AddStringArg("activation", "PRELU") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); + net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } - if (D == DeviceType::CPU) { - auto expected = net.CreateTensor( - {2, 2, 2, 2}, - {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 
0}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); - } + auto expected = net.CreateTensor( + {2, 2, 2, 2}, + {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -288,32 +207,14 @@ void TestSimpleTanh() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "TanhTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "TanhTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "TANH") - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "TanhTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "TANH") + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, @@ -341,32 +242,14 @@ void TestSimpleSigmoid() { "Input", {2, 2, 2, 2}, {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Activation", "SigmoidTest") - .Input("InputImage") - .Output("OutputImage") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); + OpDefBuilder("Activation", "SigmoidTest") + .Input("Input") + .Output("Output") + .AddStringArg("activation", "SIGMOID") + .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("Activation", "SigmoidTest") - .Input("Input") - .Output("Output") - .AddStringArg("activation", "SIGMOID") - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); - } + // Run + net.RunOp(D); auto expected = net.CreateTensor( {2, 2, 2, 2}, diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 5db2bda4bd67a818019b6a163ec912cb80799151..f5e11740d79597bc02e9f2fba3c55a6e286b8a7c 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -32,28 +32,13 @@ void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { net.AddRandomInput(MakeString("Input", i).c_str(), {n, h, w, c}); } - if (D == DeviceType::GPU) { - for (int i = 0; i < inputs; ++i) { - BufferToImage(&net, MakeString("Input", i).c_str(), - MakeString("InputImage", i).c_str(), - ops::BufferType::IN_OUT_CHANNEL); - } - OpDefBuilder op_def_builder("AddN", "AddNBM"); - for (int i = 0; i < inputs; ++i) { - op_def_builder.Input(MakeString("InputImage", i).c_str()); - } - op_def_builder.Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder op_def_builder("AddN", "AddNBM"); - for (int i = 0; i < inputs; ++i) { - op_def_builder.Input(MakeString("Input", i).c_str()); - } - op_def_builder.Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + OpDefBuilder op_def_builder("AddN", "AddNBM"); + for (int i = 0; i < inputs; ++i) { + op_def_builder.Input(MakeString("Input", i).c_str()); } + op_def_builder.Output("Output") + .AddIntArg("T", 
static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 865fdd7f95159f5f9f1030376583fbdb5f40e1e1..f006570c49627de85ab61e4b55d74f9d825c12ac 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -62,39 +62,15 @@ void SimpleAdd3() { net.AddInputFromArray("Input3", {1, 2, 3, 1}, {-0.1582, 2, 3, 4, 5, 6}); - const int input_num = 4; - if (D == DeviceType::GPU) { - // run on gpu - for (int i = 0; i < input_num; ++i) { - BufferToImage(&net, MakeString("Input", i), - MakeString("InputImage", i), - ops::BufferType::IN_OUT_CHANNEL); - } - - auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); - for (int i = 0; i < input_num; ++i) { - op_def_cl.Input(MakeString("InputImage", i)); - } - op_def_cl.Output("OutputImage") - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); - - // Run on device - net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { - OpDefBuilder("AddN", "AddNTest") - .Input("Input0") - .Input("Input1") - .Input("Input2") - .Input("Input3") - .Output("Output") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + OpDefBuilder("AddN", "AddNTest") + .Input("Input0") + .Input("Input1") + .Input("Input2") + .Input("Input3") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); auto expected = net.CreateTensor({1, 2, 3, 1}, {-0.000713, 8, 12, 16, 20, 24}); @@ -138,28 +114,10 @@ void RandomTest() { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - // run on gpu - for (int i = 0; i < input_num; ++i) { - BufferToImage(&net, MakeString("Input", i), - MakeString("InputImage", i), - ops::BufferType::IN_OUT_CHANNEL); - } - - auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); - for (int i = 0; i < input_num; ++i) { - op_def_cl.Input(MakeString("InputImage", i)); - } - op_def_cl.Output("OutputImage") - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); - - // Run on device + // run on device net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc index cf022d6ae7a2e9cee8bf4368869f8e8eab9faf68..1758f79b799a11df6b075222ffb022be5a71b615 100644 --- a/mace/ops/batch_norm.cc +++ b/mace/ops/batch_norm.cc @@ -158,14 +158,16 @@ class BatchNormOp : public Operation { } // Transform filters int input_size = operator_def_->input_size(); - for (int i = 0; i < input_size; ++i) { + for (int i = 1; i < input_size; ++i) { const Tensor *input_tensor = context->workspace()->GetTensor( operator_def_->input(i)); - if (input_tensor != nullptr && input_tensor->is_weight()) { - MACE_CHECK(TransformFilter( - context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); - } + MACE_CHECK(input_tensor != nullptr); + MACE_CHECK(TransformFilter( + context, + operator_def_.get(), + i, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 814b631e66a25a26d4faf45b23308c2476144319..d3467e769f32a69732b366e2d077f5fb6c8959e8 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ 
b/mace/ops/batch_norm_benchmark.cc @@ -36,13 +36,12 @@ void BatchNorm( } else { MACE_NOT_IMPLEMENTED; } - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}, true); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true, true); - if (D == DeviceType::CPU) { - OpDefBuilder("BatchNorm", "BatchNormBM") + OpDefBuilder("BatchNorm", "BatchNormBM") .Input("Input") .Input("Scale") .Input("Offset") @@ -50,30 +49,8 @@ void BatchNorm( .Input("Var") .AddFloatArg("epsilon", 1e-3) .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormBM") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // tuning setenv("MACE_TUNING", "1", 1); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 214fd5075c328af4435563c52c3fadb120f39651..d7c4903e1449371a14830d057e8ede4c03cf0cea 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -28,10 +28,10 @@ void Simple() { // Add input data net.AddInputFromArray("Input", {1, 6, 2, 1}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); - net.AddInputFromArray("Scale", {1}, {4.0f}); - net.AddInputFromArray("Offset", {1}, {2.0}); - net.AddInputFromArray("Mean", {1}, {10}); - net.AddInputFromArray("Var", {1}, {11.67f}); + net.AddInputFromArray("Scale", {1}, {4.0f}, true); + net.AddInputFromArray("Offset", {1}, {2.0}, true); + net.AddInputFromArray("Mean", {1}, {10}, true); + net.AddInputFromArray("Var", {1}, {11.67f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -49,32 +49,17 @@ void Simple() { net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } // Check @@ -103,10 +88,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Add input data net.AddRandomInput("Input", 
{batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -133,25 +118,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") + .Output("Output") .Finalize(net.NewOperatorDef()); // Tuning @@ -162,10 +136,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Run on opencl net.RunOp(DeviceType::GPU); net.Sync(); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -183,10 +154,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -212,25 +183,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-1) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -243,9 +203,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-1, 1e-2); } @@ -263,10 +221,10 @@ TEST_F(BatchNormOpTest, 
ComplexRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -292,25 +250,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") + .Output("Output") .Finalize(net.NewOperatorDef()); // tuning @@ -322,9 +269,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -342,10 +287,10 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); - net.AddRandomInput("Mean", {channels}); - net.AddRandomInput("Var", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); + net.AddRandomInput("Mean", {channels}, true); + net.AddRandomInput("Var", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -371,25 +316,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Mean", "MeanImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Var", "VarImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") .AddFloatArg("epsilon", 1e-1) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -402,9 +336,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-1, 1e-2); } diff 
--git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index 7ea19f6bcdb57e270092ecb21e497dd4b7ee7e3c..9664a917e6256687a7c0bba75a3c5cb52732071e 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -32,23 +32,13 @@ void BMBatchToSpace( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::CPU) { - OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("Input") - .Output("Output") - .AddIntsArg("crops", {0, 0, 0, 0}) - .AddIntsArg("block_shape", {arg, arg}) - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("crops", {0, 0, 0, 0}) - .AddIntsArg("block_shape", {arg, arg}) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") + .Input("Input") + .Output("Output") + .AddIntsArg("crops", {0, 0, 0, 0}) + .AddIntsArg("block_shape", {arg, arg}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc index 0b406dd1269b2f2ad6925b232b2d845566836c9b..59579fa518bd613700251ee74b2265025337d58d 100644 --- a/mace/ops/bias_add.cc +++ b/mace/ops/bias_add.cc @@ -108,7 +108,7 @@ class BiasAddOp : public Operation { MACE_NOT_IMPLEMENTED; } MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 1, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index dce361e94130ebb3fb9c55a50c822306228cbcf7..9026ffb2b2142b4b7d9d99c303401fc759ca0e05 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -28,35 +28,24 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) { OpsTestNet net; // Add input data + DataFormat data_format = NHWC; if (D == DeviceType::CPU) { + data_format = NCHW; net.AddRandomInput("Input", {batch, channels, height, width}); } else if (D == DeviceType::GPU) { net.AddRandomInput("Input", {batch, height, width, channels}); } else { MACE_NOT_IMPLEMENTED; } - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true, true); - if (D == DeviceType::CPU) { - OpDefBuilder("BiasAdd", "BiasAddBM") + OpDefBuilder("BiasAdd", "BiasAddBM") .Input("Input") .Input("Bias") - .AddIntArg("data_format", NCHW) + .AddIntArg("data_format", data_format) .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BiasAdd", "BiasAddBM") - .Input("InputImage") - .Input("BiasImage") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index ba31ccec30e53b54f02e42890fc5060e6c7437b7..77c6e7c4a14b8fcf5be00805877b4717770cc732 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -28,7 +28,7 @@ void BiasAddSimple() { // Add 
input data net.AddInputFromArray("Input", {1, 6, 2, 1}, {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); - net.AddInputFromArray("Bias", {1}, {0.5f}); + net.AddInputFromArray("Bias", {1}, {0.5f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -44,22 +44,13 @@ void BiasAddSimple() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BiasAdd", "BiasAddTest") - .Input("InputImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -90,7 +81,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -113,25 +104,17 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - + // Run on gpu OpDefBuilder("BiasAdd", "BiasAddTest") - .Input("InputImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { @@ -147,7 +130,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Bias", {channels}, true); + net.AddRandomInput("Bias", {channels}, true, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -169,25 +152,17 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - + // Run on gpu OpDefBuilder("BiasAdd", "BiasAddTest") - .Input("InputImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace test diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc deleted file mode 100644 index 8482e2552a55c7e7d681a4e5239d510cc4f2bdfb..0000000000000000000000000000000000000000 --- a/mace/ops/buffer_inverse_transform.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "mace/core/operator.h" -#include "mace/ops/opencl/buffer/buffer_inverse_transform.h" -#include "mace/ops/opencl/image/image_to_buffer.h" - -namespace mace { -namespace ops { - -template -class BufferInverseTransformOp; - -template -class BufferInverseTransformOp : public Operation { - public: - explicit BufferInverseTransformOp(OpConstructContext *context) - : Operation(context), - wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ImageToBuffer); - } else { - kernel_.reset(new opencl::buffer::BufferInverseTransform); - } - } - - MaceStatus Run(OpContext *context) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - - ops::BufferType type = - static_cast(Operation::GetOptionalArg( - "buffer_type", static_cast(ops::CONV2D_FILTER))); - - return kernel_->Compute(context, input, type, - wino_blk_size_, output); - } - - private: - const int wino_blk_size_; - std::unique_ptr kernel_; -}; - - -void RegisterBufferInverseTransform(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "BufferInverseTransform", - BufferInverseTransformOp, DeviceType::GPU, float); - - MACE_REGISTER_OP(op_registry, "BufferInverseTransform", - BufferInverseTransformOp, DeviceType::GPU, half); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 825ba1053361c3b897c3bf6e7a93b7918a7f7acf..f5f1df413258fc1a1a66729b7af7d39604281039 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -14,6 +14,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/ops_test_util.h" namespace mace { @@ -28,26 +29,36 @@ void FilterBufferToImage(int iters, mace::testing::StopTiming(); OpsTestNet net; + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", {out_channel, in_channel, height, width}); + // Create output + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - OpDefBuilder("BufferToImage", "BufferToImageBM") - .Input("Input") - .Output("Output") - .Finalize(net.NewOperatorDef()); + auto transform_func = [&]() { + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, + net.ws()->GetTensor("Input"), + OpenCLBufferType::IN_OUT_CHANNEL, + MemoryType::GPU_IMAGE, + 0, + b2i_output); + }; // Warm-up net.Setup(D); for (int i = 0; i < 5; ++i) { - net.Run(); + transform_func(); } net.Sync(); mace::testing::StartTiming(); while (iters--) { - net.Run(); + transform_func(); } net.Sync(); } diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index 
fcf7e37015cf3ff0d2db7f3c48c392ede4b452f2..e6a65aa258fa8c76328c5be88a99e04e0bb1f074 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -14,6 +14,7 @@ #include "gtest/gtest.h" #include "mace/ops/ops_test_util.h" +#include "mace/ops/opencl/buffer_transformer.h" namespace mace { namespace ops { @@ -21,31 +22,27 @@ namespace test { namespace { template -void TestBidirectionTransform(const int type, +void TestBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", input_shape); + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - // Run - net.RunOp(D); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_IMAGE, 0, b2i_output); - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("I2BOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(D); + // Inverse Transform + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -54,132 +51,139 @@ void TestBidirectionTransform(const int type, } // namespace TEST(BufferToImageTest, ArgSmall) { - TestBidirectionTransform(ops::ARGUMENT, {1}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {1}); } TEST(BufferToImageTest, ArgHalfSmall) { - TestBidirectionTransform(ops::ARGUMENT, {11}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {11}); } TEST(BufferToImageTest, ArgMedium) { - TestBidirectionTransform(ops::ARGUMENT, {11}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {11}); } TEST(BufferToImageTest, ArgLarge) { - TestBidirectionTransform(ops::ARGUMENT, {256}); + TestBidirectionTransform(OpenCLBufferType::ARGUMENT, + {256}); } TEST(BufferToImageTest, InputSmallSingleChannel) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {1, 2, 3, 1}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 1}); } TEST(BufferToImageTest, InputSmallMultipleChannel) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {1, 2, 3, 3}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 3}); } TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {3, 2, 3, 3}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {3, 2, 3, 3}); } TEST(BufferToImageTest, InputMedium) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {3, 13, 17, 128}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {3, 13, 17, 128}); } TEST(BufferToImageTest, InputLarge) { - TestBidirectionTransform(ops::IN_OUT_CHANNEL, - {3, 64, 64, 256}); + TestBidirectionTransform( + OpenCLBufferType::IN_OUT_CHANNEL, {3, 64, 64, 256}); } TEST(BufferToImageTest, 
Filter1x1Small) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {5, 3, 1, 1}); } TEST(BufferToImageTest, Filter1x1Medium) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {13, 17, 1, 1}); } TEST(BufferToImageTest, Filter1x1Large) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {512, 128, 1, 1}); } TEST(BufferToImageTest, Filter3x3Small) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {3, 5, 3, 3}); } TEST(BufferToImageTest, Filter3x3Medium) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {17, 13, 3, 3}); } TEST(BufferToImageTest, Filter3x3Large) { - TestBidirectionTransform(ops::CONV2D_FILTER, + TestBidirectionTransform(CONV2D_FILTER, {256, 128, 3, 3}); } TEST(BufferToImageTest, WeightWidthSmall) { - TestBidirectionTransform(ops::WEIGHT_WIDTH, - {1, 3, 3, 3}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_WIDTH, + {1, 3, 3, 3}); } TEST(BufferToImageTest, WeightWidthMedium) { - TestBidirectionTransform(ops::WEIGHT_WIDTH, - {11, 13, 13, 17}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_WIDTH, + {11, 13, 13, 17}); } TEST(BufferToImageTest, WeightWidthLarge) { - TestBidirectionTransform(ops::WEIGHT_WIDTH, - {64, 64, 11, 13}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_WIDTH, + {64, 64, 11, 13}); } TEST(BufferToImageTest, WeightHeightSmall) { - TestBidirectionTransform(ops::WEIGHT_HEIGHT, - {2, 1, 1, 1}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_HEIGHT, + {2, 1, 1, 1}); } TEST(BufferToImageTest, WeightHeightMedium) { - TestBidirectionTransform(ops::WEIGHT_HEIGHT, - {11, 13, 13, 17}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_HEIGHT, + {11, 13, 13, 17}); } TEST(BufferToImageTest, WeightHeightLarge) { - TestBidirectionTransform(ops::WEIGHT_HEIGHT, - {64, 16, 11, 13}); + TestBidirectionTransform( + OpenCLBufferType::WEIGHT_HEIGHT, + {64, 16, 11, 13}); } namespace { template -void TestDiffTypeBidirectionTransform(const int type, +void TestDiffTypeBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", input_shape); + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - // Run - net.RunOp(D); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("I2BOutput") - .AddIntArg("buffer_type", type) - .Finalize(net.NewOperatorDef()); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_IMAGE, 0, b2i_output); - // Run - net.RunOp(D); + // Inverse Transform + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DT_FLOAT); + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -188,40 +192,38 @@ void TestDiffTypeBidirectionTransform(const int 
type, } // namespace TEST(BufferToImageTest, ArgFloatToHalfSmall) { - TestDiffTypeBidirectionTransform(ops::ARGUMENT, - {11}); + TestDiffTypeBidirectionTransform( + OpenCLBufferType::ARGUMENT, + {11}); } namespace { template -void TestStringHalfBidirectionTransform(const int type, +void TestStringHalfBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape, const unsigned char *input_data) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); + // Add input data const half *h_data = reinterpret_cast(input_data); - net.AddInputFromArray("Input", input_shape, std::vector(h_data, h_data + 2)); + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); - // Run - net.RunOp(D); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("I2BOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + // Transform + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_IMAGE, 0, b2i_output); - // Run - net.RunOp(D); + // Inverse Transform + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, + type, MemoryType::GPU_BUFFER, 0, i2b_output); // Check ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), @@ -233,8 +235,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) { const unsigned char input_data[] = { 0xCD, 0x3C, 0x33, 0x40, }; - TestStringHalfBidirectionTransform(ops::ARGUMENT, - {2}, input_data); + TestStringHalfBidirectionTransform( + OpenCLBufferType::ARGUMENT, {2}, input_data); } } // namespace test diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc index 1accbe213585ddb6d8c0058fee076fd191d87f2f..088b149d894b76439b32087134d3958da4ad0af4 100644 --- a/mace/ops/buffer_transform.cc +++ b/mace/ops/buffer_transform.cc @@ -28,34 +28,27 @@ class BufferTransformOp : public Operation { public: explicit BufferTransformOp(OpConstructContext *context) : Operation(context), - wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)), - out_mem_type_(MemoryType::GPU_BUFFER), - transformer_(nullptr) { - MemoryType in_mem_type = context->workspace()->GetTensor( - operator_def_->input(0))->memory_type(); - if (context->device()->opencl_runtime()->UseImageMemory()) { - out_mem_type_ = MemoryType::GPU_IMAGE; - } - transformer_.reset(new OpenCLBufferTransformer(in_mem_type, - out_mem_type_)); - } + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 0)), + out_mem_type_(static_cast(Operation::GetOptionalArg( + "mem_type", static_cast(MemoryType::GPU_IMAGE)))) {} MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); Tensor *output = this->Output(0); - ops::BufferType type = - static_cast(Operation::GetOptionalArg( - "buffer_type", static_cast(ops::CONV2D_FILTER))); + auto type = + static_cast(Operation::GetOptionalArg( + "buffer_type", static_cast(CONV2D_FILTER))); - return transformer_->Transform( - context, input, type, 
wino_blk_size_, out_mem_type_, output); + MemoryType in_mem_type = context->workspace()->GetTensor( + operator_def_->input(0))->memory_type(); + return OpenCLBufferTransformer(in_mem_type, out_mem_type_).Transform( + context, input, type, out_mem_type_, wino_blk_size_, output); } private: const int wino_blk_size_; MemoryType out_mem_type_; - std::unique_ptr> transformer_; }; diff --git a/mace/ops/buffer_transform_test.cc b/mace/ops/buffer_transform_test.cc index c768d671c3b7cd1da70b673054207306e75f56dc..c18e81cf99f4b8d6d1fef29ba3d95aa8873292f2 100644 --- a/mace/ops/buffer_transform_test.cc +++ b/mace/ops/buffer_transform_test.cc @@ -15,6 +15,7 @@ #include #include "gtest/gtest.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/ops_test_util.h" namespace mace { @@ -30,31 +31,31 @@ class BufferTransformTest : public OpsTestBase { namespace { template -void TestBidirectionTransform(const int type, +void TestBidirectionTransform(const OpenCLBufferType type, const std::vector &input_shape) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("TransformedOutput") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", input_shape); - - // Run - net.RunOp(DeviceType::GPU); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("TransformedOutput") - .Output("Output") - .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); - - // Run - net.RunOp(DeviceType::GPU); + Tensor *bt_output = net.ws()->CreateTensor( + "BtOutput", context.device()->allocator(), + DataTypeToEnum::value); + + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, + MemoryType::GPU_BUFFER) + .Transform(&context, net.ws()->GetTensor("Input"), + type, MemoryType::GPU_BUFFER, 0, bt_output); + + // Inverse Transform + Tensor *output = net.ws()->CreateTensor( + "Output", context.device()->allocator(), + DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, + MemoryType::GPU_BUFFER) + .Transform(&context, bt_output, + type, MemoryType::GPU_BUFFER, 0, output); if (DataTypeToEnum::value == DataTypeToEnum::value) { EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(), @@ -69,38 +70,35 @@ void TestBidirectionTransform(const int type, } // namespace TEST_F(BufferTransformTest, FloatToHalf) { - TestBidirectionTransform(ops::BufferType::IN_OUT_CHANNEL, + TestBidirectionTransform(OpenCLBufferType::IN_OUT_CHANNEL, {1, 2, 3, 4}); } -TEST_F(BufferTransformTest, HalfToHalf) { - TestBidirectionTransform(ops::BufferType::IN_OUT_CHANNEL, - {1, 2, 3, 4}); -} - namespace { template void TestArgumentTransform(const index_t input_size) { OpsTestNet net; - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("Output") - .AddIntArg("buffer_type", ops::BufferType::ARGUMENT) - .AddIntArg("T", DataTypeToEnum::value) - .Finalize(net.NewOperatorDef()); + OpContext context(net.ws(), + OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data net.AddRandomInput("Input", {input_size}); // Run - net.RunOp(DeviceType::GPU); + Tensor *output = net.ws()->CreateTensor( + "Output", context.device()->allocator(), + DataTypeToEnum::value); + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, + MemoryType::GPU_BUFFER) + .Transform(&context, net.ws()->GetTensor("Input"), + 
OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER, + 0, output); - auto output_tensor = net.GetOutput("Output"); index_t expected_size = RoundUp(input_size, 4); - EXPECT_EQ(expected_size, output_tensor->buffer_shape()[0]); + EXPECT_EQ(expected_size, output->buffer_shape()[0]); // Check - ExpectTensorNear(*net.GetTensor("Input"), *output_tensor, + ExpectTensorNear(*net.GetTensor("Input"), *output, 1e-3, 1e-4); } } // namespace diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 6707a5c950aab3c46b90094254f26ef20dde7e84..db5f8494af4d2f0bfceb1288d250572d1e15a830 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -36,23 +36,11 @@ void ChannelShuffle( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Softmax", "SoftmaxBM") + OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") .Input("Input") .Output("Output") + .AddIntArg("group", group) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") - .Input("InputImage") - .Output("Output") - .AddIntArg("group", group) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index ca301a1f3433daefe95480fbfba5991dd25d60b3..1afcc41f55aa6bf45ca4b10ac3180ea8d0d6188c 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -59,22 +59,15 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) { "Input", {1, 1, 2, 16}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("group", 4) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - // Check auto expected = net.CreateTensor( {1, 1, 2, 16}, diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc index f4c7ebbefac649e87af26a1a295f5613e171a4a7..eec11e0bb132055238d0dee95091d088729799bc 100644 --- a/mace/ops/concat.cc +++ b/mace/ops/concat.cc @@ -28,7 +28,8 @@ class ConcatOpBase : public Operation { public: explicit ConcatOpBase(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 3)) {} + axis_(Operation::GetOptionalArg("axis", 3)), + checked_(false) {} protected: void Validate() { @@ -42,6 +43,7 @@ class ConcatOpBase : public Operation { protected: int axis_; + bool checked_; }; template @@ -55,7 +57,15 @@ class ConcatOp : public ConcatOpBase { MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - Validate(); + if (!checked_) { + Validate(); + if (this->Input(0)->dim_size() == 4) { + if (axis_ == 3) axis_ = 1; + else if (axis_ == 2) axis_ = 3; + else if (axis_ == 1) axis_ = 2; + } + checked_ = true; + } const std::vector &inputs = this->Inputs(); Tensor *output = this->Output(0); const Tensor *input0 = inputs.front(); diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 02411591b558503c78508908b0a312e611eb8ca7..a43fc3084f880754612e50d75753d353d09dd04f 100644 --- a/mace/ops/concat_benchmark.cc +++ 
b/mace/ops/concat_benchmark.cc @@ -76,7 +76,7 @@ MACE_BM_CONCAT_CPU(1, 1225, 128); namespace { template -void OpenclConcatHelper(int iters, +void OpenCLConcatHelper(int iters, const std::vector &shape0, const std::vector &shape1, int concat_dim) { @@ -88,15 +88,11 @@ void OpenclConcatHelper(int iters, net.AddRandomInput("Input0", shape0); net.AddRandomInput("Input1", shape1); - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Concat", "ConcatBM") - .Input("InputImage0") - .Input("InputImage1") + .Input("Input0") + .Input("Input1") .AddIntArg("axis", concat_dim) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); @@ -120,7 +116,7 @@ void OpenclConcatHelper(int iters, #define MACE_BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \ static void MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) {\ std::vector shape = {N, H, W, C}; \ - OpenclConcatHelper(iters, shape, shape, 3); \ + OpenCLConcatHelper(iters, shape, shape, 3); \ } \ MACE_BENCHMARK(MACE_BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE) diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 83307e781a29eccbaa065588c9e2b554219ed2b0..a1b38898f9cf05919edf4433a7d502d3ae1626c7 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -104,7 +104,7 @@ TEST_F(ConcatOpTest, CPURandom) { static unsigned int seed = time(NULL); int dim = 5; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = rand_r(&seed) % dim; + int axis = 1; // Construct graph OpsTestNet net; auto builder = OpDefBuilder("Concat", "ConcatTest"); @@ -157,7 +157,8 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { static unsigned int seed = time(NULL); int dim = 4; int num_inputs = 2 + rand_r(&seed) % 10; - int axis = rand_r(&seed) % dim; + int axis = 1; + int axis_arg = 3; // NHWC // Construct graph OpsTestNet net; @@ -178,13 +179,13 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { std::vector output_shape = input_shapes[0]; output_shape[axis] = concat_axis_size; net.AddRandomInput( - "Output", output_shape, true, true); + "Output", output_shape, false, true, true); auto builder = OpDefBuilder("Concat", "ConcatTest"); for (int i = 0; i < num_inputs; ++i) { builder = builder.Input(MakeString("Input", i)); } - builder.AddIntArg("axis", axis) + builder.AddIntArg("axis", axis_arg) .Output("Output") .Finalize(net.NewOperatorDef()); @@ -212,7 +213,7 @@ TEST_F(ConcatOpTest, QuantizedCPURandom) { net.RunOp(); net.AddRandomInput( - "QuantizedOutput", output_shape, true, true); + "QuantizedOutput", output_shape, false, true, true); auto q_builder = OpDefBuilder("Concat", "QuantizedConcatTest"); for (int i = 0; i < num_inputs; ++i) { q_builder = q_builder.Input(MakeString("QuantizedInput", i)); @@ -255,32 +256,26 @@ void OpenclRandomTest(const std::vector> &shapes, OpsTestNet net; for (int i = 0; i < num_inputs; ++i) { const std::string input_name = MakeString("Input", i); - const std::string image_name = MakeString("InputImage", i); concat_axis_size += shapes[i][axis]; GenerateRandomRealTypeData(shapes[i], &inputs[i]); input_ptrs[i] = inputs[i].data(); net.AddInputFromArray(input_name, shapes[i], inputs[i]); - BufferToImage(&net, input_name, image_name, - ops::BufferType::IN_OUT_CHANNEL); } auto builder = OpDefBuilder("Concat", "ConcatTest"); for (int i = 0; i < num_inputs; ++i) { - const std::string image_name = MakeString("InputImage", i); + const 
std::string image_name = MakeString("Input", i); builder = builder.Input(image_name); } builder.AddIntArg("axis", axis) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - // Check auto output = net.GetOutput("Output"); diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index bf5ebaa0c07abd30cc7884bb1b896621d6e67e09..a5cbec7411aaa47f82717e50a71ee1cf3d4d87e6 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -959,8 +959,9 @@ class Conv2dOp : public ConvPool2dOpBase { : ConvPool2dOpBase(context), activation_(ops::StringToActivationType( Operation::GetOptionalArg("activation", - "NOOP"))), - relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) { + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), + wino_block_size_(Operation::GetOptionalArg("wino_block_size", 0)) { MemoryType mem_type; if (context->device()->opencl_runtime()->UseImageMemory()) { mem_type = MemoryType::GPU_IMAGE; @@ -969,13 +970,32 @@ class Conv2dOp : public ConvPool2dOpBase { mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::Conv2dKernel); } + context->set_output_mem_type(mem_type); // Transform filter tensor to target format - MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); + if ((wino_block_size_ == 2 || wino_block_size_ == 4) && + (kernel_->CheckUseWinograd( + context->device()->opencl_runtime(), + context->workspace()->GetTensor( + operator_def_->input(1))->shape(), + std::vector(operator_def_->output_shape(0).dims().begin(), + operator_def_->output_shape(0).dims().end()), + strides_.data(), + dilations_.data(), + &wino_block_size_))) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::WINOGRAD_FILTER, mem_type, wino_block_size_) + == MaceStatus::MACE_SUCCESS); + } else { + wino_block_size_ = 0; + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + } if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } @@ -987,13 +1007,14 @@ class Conv2dOp : public ConvPool2dOpBase { return kernel_->Compute(context, input, filter, bias, strides_.data(), padding_type_, paddings_, dilations_.data(), activation_, relux_max_limit_, - output); + wino_block_size_, output); } private: const ActivationType activation_; const float relux_max_limit_; std::unique_ptr kernel_; + int wino_block_size_; private: MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 96be2902a3b07d5e1a7bf0a9e25587af6de3c2cb..91efff7974df9e159f531fb4fcd104751e5ed0f4 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -49,11 +49,10 @@ void Conv2d(int iters, } net.AddRandomInput("Filter", {output_channels, channels, kernel_h, - kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); - if (D == DeviceType::CPU) { - OpDefBuilder("Conv2D", "Conv2dTest") + OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") .Input("Bias") @@ -63,26 
+62,6 @@ void Conv2d(int iters, .AddIntsArg("dilations", {dilation, dilation}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {dilation, dilation}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } net.Setup(D); @@ -123,9 +102,9 @@ void Conv2d(int iters, "Input", {batch, height, width, channels}); net.GetTensor("Input")->SetScale(0.1); net.AddRandomInput( - "Filter", {output_channels, kernel_h, kernel_w, channels}); + "Filter", {output_channels, kernel_h, kernel_w, channels}, true); net.GetTensor("Filter")->SetScale(0.1); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Bias", {output_channels}, true); OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index db7f0458fb9327c8cffa06a773c102a3421aa5ea..eb21ef2c3e596ba28ce4178574dcb74db59a434f 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -31,7 +31,7 @@ class Conv2dOpTest : public OpsTestBase { namespace { template -void TestNHWCSimple3x3VALID() { +void TestNHWCSimple3x3VALID(int wino_blk_size = 0) { OpsTestNet net; // Add input data net.AddInputFromArray( @@ -40,8 +40,9 @@ void TestNHWCSimple3x3VALID() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + net.AddInputFromArray("Bias", {1}, {0.1f}, true); + const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -60,34 +61,25 @@ void TestNHWCSimple3x3VALID() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } - auto expected = net.CreateTensor({1, 1, 1, 1}, {18.1f}); + auto expected = net.CreateTensor(output_shape, {18.1f}); if (DataTypeToEnum::value == DataType::DT_FLOAT) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { @@ -96,7 +88,7 @@ void TestNHWCSimple3x3VALID() { } 
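A note on the mace/ops/concat.cc hunk a little earlier: the `axis` argument reaches the op in NHWC terms (channels = 3), but the CPU ConcatOp now runs on NCHW tensors, so the first Run() remaps a 4-D input's axis once (3 -> 1 for channels, 2 -> 3 for width, 1 -> 2 for height). That is also why concat_test.cc now builds the op with axis 3 while checking shapes along axis 1. A condensed restatement of the added logic, for readability only (a switch is used here in place of the hunk's if/else chain):

// Remap an NHWC axis argument to the NCHW layout used at runtime.
// N stays at 0; H: 1 -> 2, W: 2 -> 3, C: 3 -> 1.
if (!checked_) {
  Validate();
  if (this->Input(0)->dim_size() == 4) {
    switch (axis_) {
      case 3: axis_ = 1; break;  // channels
      case 2: axis_ = 3; break;  // width
      case 1: axis_ = 2; break;  // height
      default: break;            // batch axis is unchanged
    }
  }
  checked_ = true;
}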
template -void TestNHWCSimple3x3SAME() { +void TestNHWCSimple3x3SAME(int wino_blk_size = 0) { OpsTestNet net; // Add input data @@ -106,8 +98,9 @@ void TestNHWCSimple3x3SAME() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + net.AddInputFromArray("Bias", {1}, {0.1f}, true); + const std::vector output_shape = {1, 3, 3, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -126,35 +119,26 @@ void TestNHWCSimple3x3SAME() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } auto expected = net.CreateTensor( - {1, 3, 3, 1}, + output_shape, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); if (DataTypeToEnum::value == DataType::DT_FLOAT) { @@ -180,6 +164,14 @@ TEST_F(Conv2dOpTest, OPENCLHalfSimple) { TestNHWCSimple3x3SAME(); } +TEST_F(Conv2dOpTest, OPENCLSimpleWinograd) { + TestNHWCSimple3x3SAME(4); + TestNHWCSimple3x3VALID(2); + TestNHWCSimple3x3VALID(2); + // TODO(liutuo): the precision of the last value is not normal. 
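With wino_blk_size plumbed through, the GPU branch of these simple conv tests now builds the operator directly on buffer tensors and passes both an output shape hint and the requested Winograd block size. Roughly, with illustrative tensor names, the shapes used above, and the template arguments written out:

OpDefBuilder("Conv2D", "Conv2dTest")
    .Input("Input")
    .Input("Filter")
    .Input("Bias")
    .Output("Output")
    .OutputShape({1, 3, 3, 1})          // lets the op check Winograd eligibility
    .AddIntsArg("strides", {1, 1})
    .AddIntArg("padding", Padding::SAME)
    .AddIntsArg("dilations", {1, 1})
    .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
    .AddIntArg("wino_block_size", 2)    // 0 = direct kernel, 2 or 4 = Winograd
    .Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU);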
+// TestNHWCSimple3x3SAME(4); +} + namespace { template void TestNHWCSimple3x3WithoutBias() { @@ -192,7 +184,7 @@ void TestNHWCSimple3x3WithoutBias() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -212,15 +204,10 @@ void TestNHWCSimple3x3WithoutBias() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Output("Output") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) @@ -228,9 +215,6 @@ void TestNHWCSimple3x3WithoutBias() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -265,8 +249,9 @@ void TestNHWCCombined3x3() { "Filter", {2, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}, + true); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -286,18 +271,11 @@ void TestNHWCCombined3x3() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) @@ -305,9 +283,6 @@ void TestNHWCCombined3x3() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -330,7 +305,7 @@ TEST_F(Conv2dOpTest, OPENCLStride2) { namespace { template -void TestFusedNHWCSimple3x3VALID() { +void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) { OpsTestNet net; // Add input data net.AddInputFromArray( @@ -339,8 +314,9 @@ void TestFusedNHWCSimple3x3VALID() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {-0.1f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + net.AddInputFromArray("Bias", {1}, {-0.1f}, true); + const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -361,39 +337,30 @@ void TestFusedNHWCSimple3x3VALID() { 
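A recurring detail in these test hunks is the new trailing `true` passed to AddInputFromArray/AddRandomInput for filters and biases. It appears to mark the tensor as a constant weight so the op constructor can pre-transform it to the GPU layout, which is why the explicit BufferToImage/ImageToBuffer steps disappear from the GPU branches. When a test still wants a manual conversion it calls OpenCLBufferTransformer directly, as the buffer_to_image tests earlier in this diff do. A sketch of both pieces, with illustrative names and assumed template arguments:

OpsTestNet net;
OpContext context(net.ws(),
                  OpTestContext::Get()->GetDevice(DeviceType::GPU));
// The trailing `true` flags Filter/Bias as constant (weight) tensors.
net.AddRandomInput<DeviceType::GPU, float>("Input", {1, 3, 3, 2});
net.AddRandomInput<DeviceType::GPU, float>("Filter", {1, 2, 3, 3}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {1}, true);

// Explicit buffer -> image conversion, where a test still needs one.
Tensor *image = net.ws()->CreateTensor(
    "InputImage", context.device()->allocator(), DataTypeToEnum<float>::value);
OpenCLBufferTransformer<float>(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE)
    .Transform(&context, net.ws()->GetTensor("Input"),
               OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE,
               /*wino_blk_size=*/0, image);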
net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddStringArg("activation", "RELU") + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } - auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor(output_shape, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } template -void TestFusedNHWCSimple3x3WithoutBias() { +void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) { OpsTestNet net; // Add input data @@ -403,7 +370,8 @@ void TestFusedNHWCSimple3x3WithoutBias() { net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true); + const std::vector output_shape = {1, 1, 1, 1}; if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -424,32 +392,26 @@ void TestFusedNHWCSimple3x3WithoutBias() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Output("Output") + .OutputShape(output_shape) .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddStringArg("activation", "RELU") + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } // Check - auto expected = net.CreateTensor({1, 1, 1, 1}, {0.0f}); + auto expected = net.CreateTensor(output_shape, {0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output")); } @@ -466,6 +428,13 @@ TEST_F(Conv2dOpTest, FusedOPENCLSimple) { TestFusedNHWCSimple3x3WithoutBias(); } +TEST_F(Conv2dOpTest, FusedOPENCLSimpleWinograd) { + TestFusedNHWCSimple3x3VALID(2); + TestFusedNHWCSimple3x3WithoutBias(2); + TestFusedNHWCSimple3x3VALID(4); + TestFusedNHWCSimple3x3WithoutBias(4); +} + namespace { template void TestConv1x1() { @@ -484,8 +453,8 @@ void TestConv1x1() { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( "Filter", {2, 5, 1, 1}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, true); + 
net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -504,27 +473,17 @@ void TestConv1x1() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -550,7 +509,8 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } namespace { template void TestComplexConvNxNS12(const std::vector &shape, - const int stride) { + const int stride, + const int wino_blk_size = 0) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type) { @@ -559,16 +519,16 @@ void TestComplexConvNxNS12(const std::vector &shape, index_t batch = 3 + (rand_r(&seed) % 10); index_t height = shape[0]; index_t width = shape[1]; - index_t input_channels = shape[2] + (rand_r(&seed) % 10); - index_t output_channels = shape[3] + (rand_r(&seed) % 10); + index_t input_channels = shape[2]; + index_t output_channels = shape[3]; OpsTestNet net; // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -595,28 +555,20 @@ void TestComplexConvNxNS12(const std::vector &shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") + .OutputShape(expected->shape()) .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -633,11 +585,21 @@ TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { TestComplexConvNxNS12({32, 16, 16, 32}, 2); } +TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNWinograd) { + TestComplexConvNxNS12({32, 16, 16, 32}, 1, 2); + TestComplexConvNxNS12({32, 16, 16, 32}, 1, 4); +} + TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TestComplexConvNxNS12({17, 113, 5, 7}, 1); 
TestComplexConvNxNS12({17, 113, 5, 7}, 2); } +TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNWinograd) { + TestComplexConvNxNS12({17, 113, 5, 7}, 1, 4); + TestComplexConvNxNS12({17, 113, 5, 7}, 1, 2); +} + TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { TestComplexConvNxNS12({31, 113, 13, 17}, 3); TestComplexConvNxNS12({32, 32, 13, 17}, 4); @@ -647,13 +609,14 @@ namespace { template void TestHalfComplexConvNxNS12(const std::vector &input_shape, const std::vector &filter_shape, - const std::vector &dilations) { + const std::vector &dilations, + const int wino_blk_size = 0) { testing::internal::LogToStderr(); srand(time(NULL)); - auto func = [&](int stride_h, int stride_w, Padding padding) { + auto func = [&](index_t batch, int stride_h, int stride_w, Padding padding) { // generate random input - index_t batch = 1; + static unsigned int seed = time(NULL); index_t height = input_shape[0]; index_t width = input_shape[1]; index_t kernel_h = filter_shape[0]; @@ -677,8 +640,11 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, "Input", {batch, height, width, input_channels}, float_input_data); net.AddInputFromArray( "Filter", {output_channels, input_channels, kernel_h, kernel_w}, - float_filter_data); - net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + float_filter_data, true); + net.AddInputFromArray("Bias", + {output_channels}, + float_bias_data, + true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -704,38 +670,31 @@ void TestHalfComplexConvNxNS12(const std::vector &input_shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") + .OutputShape(expected->shape()) .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {dilations[0], dilations[1]}) .AddIntArg("T", static_cast(DataType::DT_HALF)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-2); }; - func(1, 1, VALID); - func(1, 1, SAME); - if (dilations[0] == 1) { - func(2, 2, VALID); - func(2, 2, SAME); + for (auto batch : {1, 5}) { + func(batch, 1, 1, VALID); + func(batch, 1, 1, SAME); + if (dilations[0] == 1 && wino_blk_size == 0) { + func(batch, 2, 2, VALID); + func(batch, 2, 2, SAME); + } } } } // namespace @@ -748,6 +707,14 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, {1, 1}); } +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3Winograd) { + TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, + {1, 1}, 2); +// TODO(liutuo) : the precision error is large. 
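The commented-out block-size-4 half cases above are skipped for the precision reason given in the TODO; the cases that remain compare the GPU half result against a float CPU reference with relaxed tolerances. The dtype-dependent check added further down for TestArbitraryPadConvNxN shows the pattern:

// Half outputs accumulate more rounding error, so loosen the bounds.
if (DataTypeToEnum<T>::value == DT_HALF) {
  ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                          1e-1, 1e-2);
} else {
  ExpectTensorNear<float>(*expected, *net.GetOutput("OPENCLOutput"),
                          1e-4, 1e-4);
}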
+// TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, +// {1, 1}, 4); +} + TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv5x5S12) { TestHalfComplexConvNxNS12({32, 32}, {5, 5, 3, 64}, {1, 1}); TestHalfComplexConvNxNS12({32, 32}, {5, 5, 3, 63}, {1, 1}); @@ -795,6 +762,14 @@ TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, {1, 1}); } +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3Winograd) { + // TODO(liutuo) : the precision error is large. +// TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, +// {1, 1}, 4); + TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, + {1, 1}, 2); +} + TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) { TestHalfComplexConvNxNS12({64, 64}, {5, 5, 16, 16}, {2, 2}); } @@ -828,8 +803,8 @@ void TestDilationConvNxN(const std::vector &shape, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -856,18 +831,11 @@ void TestDilationConvNxN(const std::vector &shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {dilation_rate, dilation_rate}) @@ -875,9 +843,6 @@ void TestDilationConvNxN(const std::vector &shape, .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, 1e-4); }; @@ -927,8 +892,8 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -953,18 +918,11 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", dilations) @@ -973,8 +931,6 @@ void TestGeneralHalfAtrousConv(const std::vector &image_shape, // Run on device net.RunOp(D); - 
ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, 1e-1); }; @@ -996,7 +952,8 @@ TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) { namespace { template void TestArbitraryPadConvNxN(const std::vector &shape, - const std::vector &paddings) { + const std::vector &paddings, + const int wino_blk_size = 0) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) { srand(time(NULL)); @@ -1011,10 +968,11 @@ void TestArbitraryPadConvNxN(const std::vector &shape, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Input", + {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -1026,7 +984,6 @@ void TestArbitraryPadConvNxN(const std::vector &shape, .Output("OutputNCHW") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("padding_values", paddings) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // run on cpu @@ -1040,34 +997,35 @@ void TestArbitraryPadConvNxN(const std::vector &shape, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("OPENCLOutput") + .OutputShape(expected->shape()) .AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("padding_values", paddings) .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("wino_block_size", wino_blk_size) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, - 1e-4); + if (DataTypeToEnum::value == DT_HALF) { + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, + 1e-2); + } else { + ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, + 1e-4); + } }; - for (int kernel_size : {3, 5, 7}) { - for (int stride : {2, 3}) { - func(kernel_size, kernel_size, stride, stride); + if (wino_blk_size != 0) { + func(3, 3, 1, 1); + } else { + for (int kernel_size : {3, 5, 7}) { + for (int stride : {2, 3}) { + func(kernel_size, kernel_size, stride, stride); + } } } } @@ -1081,8 +1039,24 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) { TestArbitraryPadConvNxN({128, 128, 16, 16}, {2, 2}); } +TEST_F(Conv2dOpTest, OPENCLAlignedPad2Winograd) { + TestArbitraryPadConvNxN({128, 128, 16, 16}, + {2, 2}, 2); + TestArbitraryPadConvNxN({128, 128, 16, 16}, + {2, 2}, 4); +} + TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) { TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}); + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}); +} + +TEST_F(Conv2dOpTest, OPENCLUnalignedPad4Winograd) { + TestArbitraryPadConvNxN({107, 113, 5, 7}, {1, 1}, 2); + 
TestArbitraryPadConvNxN({107, 113, 5, 7}, {1, 1}, 2); + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}, 4); + // TODO(liutuo) : the precision error is large. + TestArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4}, 4); } namespace { @@ -1094,13 +1068,13 @@ void TestQuantSimple3x3() { net.AddInputFromArray( "Filter", {1, 3, 3, 2}, {102, 150, 123, 135, 1, 216, 137, 47, 53, 75, 145, 130, 171, 62, 255, - 122, 72, 211}, 0.0226, 127); + 122, 72, 211}, true, 0.0226, 127); net.AddInputFromArray( "Input", {1, 3, 3, 2}, {1, 75, 117, 161, 127, 119, 94, 151, 203, 151, 84, 61, 55, 142, 113, 139, - 3, 255}, 0.0204, 93); + 3, 255}, false, 0.0204, 93); - net.AddInputFromArray("Bias", {1}, {2}); + net.AddInputFromArray("Bias", {1}, {2}, true); OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") @@ -1136,12 +1110,14 @@ void TestQuant(const index_t batch, net.AddRandomInput("Input", {batch, in_height, in_width, in_channels}); net.AddRandomInput("Filter", {out_channels, k_height, k_width, - in_channels}); - net.AddRandomInput("Bias", {out_channels}); + in_channels}, true); + net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Filter", OHWI, "FilterOIHW", - OIHW); + net.TransformFilterDataFormat("Filter", + OHWI, + "FilterOIHW", + OIHW); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputNCHW") @@ -1193,7 +1169,7 @@ void TestQuant(const index_t batch, bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0, q_bias.data()); net.AddInputFromArray("QuantizedBias", - {out_channels}, q_bias); + {out_channels}, q_bias, true); OpDefBuilder("Conv2D", "QuantizeConv2dTest") .Input("QuantizedInput") .Input("QuantizedFilter") diff --git a/mace/ops/conv_pool_2d_util.cc b/mace/ops/conv_pool_2d_util.cc index a056743e85af91b562781d9821aebad87115221d..fcc44e789dbeb55f6455655420566e514f0fa1a3 100644 --- a/mace/ops/conv_pool_2d_util.cc +++ b/mace/ops/conv_pool_2d_util.cc @@ -24,7 +24,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -137,7 +137,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/conv_pool_2d_util.h b/mace/ops/conv_pool_2d_util.h index 0e45c31e4be04938adc3f4e4271b8c6140106fb0..78333717e098903e46704d870d4f93a41f52b018 100644 --- a/mace/ops/conv_pool_2d_util.h +++ b/mace/ops/conv_pool_2d_util.h @@ -35,7 +35,7 @@ namespace ops { void CalcPaddingAndOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *dilations, const int *strides, Padding padding, @@ -61,7 +61,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, void CalcOutputSize(const index_t *input_shape, const DataFormat input_format, const index_t *filter_shape, - const DataFormat filter_format, + const FilterDataFormat filter_format, const int *padding_size, const int *dilations, const int *strides, diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc deleted file mode 100644 index 
3e3185b34ada65c40fb03e398a2033cb020217f9..0000000000000000000000000000000000000000 --- a/mace/ops/core_test.cc +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -TEST(CoreTest, INIT_MODE) { - std::vector op_defs; - - Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); - std::unique_ptr> tuner; - Workspace ws; - - op_defs.emplace_back(OperatorDef()); - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input("Input") - .Output("B2IOutput") - .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER) - .AddIntArg("mode", static_cast(NetMode::INIT)) - .Finalize(&op_defs[op_defs.size() - 1]); - - Tensor *input = ws.CreateTensor("Input", device->allocator(), - DataTypeToEnum::v()); - input->Resize({1, 3, 3, 3}); - { - Tensor::MappingGuard input_mapper(input); - float *input_data = input->mutable_data(); - std::fill(input_data, input_data + input->size(), 1); - } - - op_defs.emplace_back(OperatorDef()); - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input("B2IOutput") - .Output("Output") - .AddIntArg("buffer_type", ops::BufferType::CONV2D_FILTER) - .Finalize(&op_defs[op_defs.size() - 1]); - - NetDef net_def; - for (auto &op_def : op_defs) { - net_def.add_op()->CopyFrom(op_def); - } - std::shared_ptr op_registry(new OpRegistry()); - auto net = std::unique_ptr(new SerialNet( - op_registry.get(), &net_def, &ws, device, - NetMode::INIT)); - MaceStatus status = net->Init(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - status = net->Run(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - - EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); - EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - net = std::unique_ptr(new SerialNet( - op_registry.get(), &net_def, &ws, device)); - status = net->Init(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - status = net->Run(); - MACE_CHECK(status == MaceStatus::MACE_SUCCESS); - EXPECT_TRUE(ws.GetTensor("Output") != nullptr); - - ExpectTensorNear(*ws.GetTensor("Input"), *ws.GetTensor("Output"), - 1e-5); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index b186cecc8437773c96a494fa8ad3066cf8027625..aad6f93d610e8ac6eed96bd0aef9bcbcbf27cdca 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -66,7 +66,7 @@ MACE_BM_CROP_CPU_MACRO(2, 512, 6); namespace { template -void OpenclCropHelper(int iters, +void OpenCLCropHelper(int iters, const std::vector &shape0, const std::vector &shape1, int crop_axis, @@ -79,16 +79,12 @@ void OpenclCropHelper(int iters, net.AddRandomInput("Input0", shape0); net.AddRandomInput("Input1", shape1); - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Crop", "CropBM") 
- .Input("InputImage0") - .Input("InputImage1") + .Input("Input0") + .Input("Input1") .AddIntArg("axis", crop_axis) .AddIntsArg("offset", {offset}) - .Output("OutputImage") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); @@ -114,7 +110,7 @@ void OpenclCropHelper(int iters, _##TYPE(int iters) { \ std::vector shape0 = {N, H, W, C}; \ std::vector shape1 = {N / 2, H / 2, W / 2, C / 2}; \ - OpenclCropHelper(iters, shape0, shape1, AXIS, OFFSET); \ + OpenCLCropHelper(iters, shape0, shape1, AXIS, OFFSET); \ } \ MACE_BENCHMARK(MACE_BM_CROP_GPU_##N##_##H##_##W##_##C##_##AXIS##_##OFFSET\ ##_##TYPE) diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index efada981b70a3316bce239d39912845484e85c5a..b757946c4c933bab9d6bf241fc589c5afa063566 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -34,14 +34,10 @@ void RunCrop(const std::vector &input_shape, net.AddRandomInput("Input1", input_shape2); if (D == GPU) { - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Crop", "CropTest") - .Input("InputImage0") - .Input("InputImage1") - .Output("OutputImage") + .Input("Input0") + .Input("Input1") + .Output("Output") .AddIntsArg("offset", offset) .AddIntArg("axis", axis) .Finalize(net.NewOperatorDef()); @@ -66,10 +62,7 @@ void RunCrop(const std::vector &input_shape, // Run net.RunOp(D); - if (D == GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else if (D == CPU) { + if (D == CPU) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index c9113439536746e9ce05d33e4b20feb35a075060..0b11667e39843378d7b58e86abefb15fa76fae89 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -30,6 +30,7 @@ #include "mace/ops/arm/deconv_2d_neon.h" #include "mace/utils/utils.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/deconv_2d.h" #endif // MACE_ENABLE_OPENCL @@ -358,11 +359,27 @@ class Deconv2dOp : public Deconv2dOpBase { public: explicit Deconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { + MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::Deconv2dKernel); } else { MACE_NOT_IMPLEMENTED; } + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (model_type_ == FrameworkType::CAFFE) { + if (operator_def_->input_size() >= 3) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, + OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); + } + } else if (operator_def_->input_size() >= 4) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type) + == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(0); diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index 175feacabcd514408a76434f9ee84f0fdbecdfe2..81be17c092ad0d6e91bbdf0514a4c0d94e641b10 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -47,40 +47,21 @@ static void Deconv2d(int iters, } net.AddRandomInput("Filter", {output_channels, channels, kernel_h, - kernel_w}); - net.AddRandomInput("Bias", 
{output_channels}); + kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.AddInputFromArray("OutputShape", {4}, - {batch, out_h, out_w, output_channels}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("OutputShape") - .Input("BiasImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("Input") - .Input("Filter") - .Input("OutputShape") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } - + {batch, out_h, out_w, output_channels}, + true); + OpDefBuilder("Deconv2D", "Deconv2dTest") + .Input("Input") + .Input("Filter") + .Input("OutputShape") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.Setup(D); // Warm-up diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index 9aadd42c0c345da3bac268f1c645639850fafc80..1847c9432bb642175facf3a19f757ad4dd653e4b 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -41,40 +41,34 @@ void RunTestSimple(const std::vector &input_shape, ops::FrameworkType model_type) { OpsTestNet net; // Add input data - const index_t batch = input_shape[0]; const index_t out_channels = filter_shape[2]; net.AddInputFromArray("Input", input_shape, input_data); - net.AddInputFromArray("Filter", filter_shape, filter_data); - net.AddInputFromArray("Bias", {out_channels}, bias_data); - net.TransformDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.AddInputFromArray("Filter", filter_shape, filter_data, true); + net.AddInputFromArray("Bias", {out_channels}, bias_data, true); + // TODO(liutuo): remove the unused transform + net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "FilterOIHW", "FilterImage", - ops::BufferType::CONV2D_FILTER); if (model_type == ops::FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("FilterOIHW") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", padding) .AddIntsArg("padding_values", padding_size) .AddIntArg("framework_type", model_type) .Finalize(net.NewOperatorDef()); } else { - net.AddInputFromArray("OutputShape", {4}, output_shape); + net.AddInputFromArray("OutputShape", {4}, output_shape, true); OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") + .Input("Input") + .Input("FilterOIHW") .Input("OutputShape") - .Input("BiasImage") - .Output("OutputImage") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) 
.AddIntArg("padding", padding) .AddIntsArg("padding_values", padding_size) @@ -82,10 +76,6 @@ void RunTestSimple(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); } net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -102,7 +92,7 @@ void RunTestSimple(const std::vector &input_shape, .AddIntArg("framework_type", model_type) .Finalize(net.NewOperatorDef()); } else { - net.AddInputFromArray("OutputShape", {4}, output_shape); + net.AddInputFromArray("OutputShape", {4}, output_shape, true); OpDefBuilder("Deconv2D", "Deconv2dTest") .Input("InputNCHW") @@ -387,8 +377,8 @@ void TestComplexDeconvNxN(const int batch, // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); + "Filter", {output_channels, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {output_channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); int out_h = 0; @@ -413,7 +403,7 @@ void TestComplexDeconvNxN(const int batch, output_shape.push_back(out_h); output_shape.push_back(out_w); output_shape.push_back(output_channels); - net.AddInputFromArray("OutputShape", {4}, output_shape); + net.AddInputFromArray("OutputShape", {4}, output_shape, true); } else { paddings.push_back(padding); paddings.push_back(padding); @@ -455,19 +445,12 @@ void TestComplexDeconvNxN(const int batch, expected->Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - if (model_type == ops::FrameworkType::CAFFE) { OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntsArg("padding_values", paddings) .AddIntArg("framework_type", model_type) @@ -475,11 +458,11 @@ void TestComplexDeconvNxN(const int batch, .Finalize(net.NewOperatorDef()); } else { OpDefBuilder("Deconv2D", "Deconv2dTest") - .Input("InputImage") - .Input("FilterImage") + .Input("Input") + .Input("Filter") .Input("OutputShape") - .Input("BiasImage") - .Output("OutputImage") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntArg("framework_type", model_type) @@ -489,9 +472,7 @@ void TestComplexDeconvNxN(const int batch, // Run on device net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-4, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-4, 1e-4); }; diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index 822bf8f018793c3d12e3ccd7820cb8d9196d044d..c9c6dd4016b97869289388ecbfbe200347846269 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -36,23 +36,12 @@ void DepthToSpace( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("DepthToSpace", "DepthToSpaceBM") + OpDefBuilder("DepthToSpace", "DepthToSpaceBM") .Input("Input") 
.Output("Output") + .AddIntArg("block_size", block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("DepthToSpace", "DepthToSpaceBM") - .Input("InputImage") - .Output("Output") - .AddIntArg("block_size", block_size) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index aa9b9c28d83020c0acdcf85e92f5647e4c84d678..c369bd67f4d034ba9a9e9468be73459ec002f19a 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -45,21 +45,15 @@ void RunDepthToSpace(const std::vector &input_shape, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("DepthToSpace", "DepthToSpaceTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("block_size", block_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - if (D == DeviceType::GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -134,28 +128,23 @@ void RandomTest(const int block_size, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("DepthToSpace", "DepthToSpaceTest") - .Input("InputImg") + .Input("Input") .AddIntArg("block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") + .Output("GPUOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), - *net.GetOutput("OPENCLOutput"), 1e-5); + *net.GetOutput("GPUOutput"), 1e-5); } else { ExpectTensorNear(*net.GetTensor("Output"), - *net.GetOutput("OPENCLOutput"), 1e-3, 1e-4); + *net.GetOutput("GPUOutput"), 1e-3, 1e-4); } } } // namespace diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 47f45e4a35277726aefe948a7fa5079b0616c2c2..8a85ab464ca0911b95a3ea4f039e1c61eb60da17 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -499,13 +499,17 @@ class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { mem_type = MemoryType::GPU_BUFFER; kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); } + context->set_output_mem_type(mem_type); // Transform filter tensor to target format MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::DW_CONV2D_FILTER, mem_type) - == MaceStatus::MACE_SUCCESS); + context, + operator_def_.get(), + 1, + OpenCLBufferType::DW_CONV2D_FILTER, + mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 54f3e8b739305e748dbf38fb76fd41c6c72c4fb1..4d44a9bc136b59fc5e29dd93343638f65b58db88 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ 
b/mace/ops/depthwise_conv2d_benchmark.cc @@ -57,18 +57,17 @@ void DepthwiseConv2d(int iters, } if (DataTypeToEnum::value != DT_UINT8) { net.AddRandomInput( - "Filter", {multiplier, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {input_channels * multiplier}); + "Filter", {multiplier, input_channels, kernel_h, kernel_w}, true); + net.AddRandomInput("Bias", {input_channels * multiplier}, true); } else { net.AddRandomInput( - "Filter", {kernel_h, kernel_w, input_channels, multiplier}); + "Filter", {kernel_h, kernel_w, input_channels, multiplier}, true); net.GetTensor("Filter")->SetScale(0.1); net.AddRandomInput( - "Bias", {input_channels * multiplier}); + "Bias", {input_channels * multiplier}, true); } - if (D == DeviceType::CPU) { - OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") .Input("Input") .Input("Filter") .Input("Bias") @@ -78,26 +77,6 @@ void DepthwiseConv2d(int iters, .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } net.Setup(D); diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index dfb76b44dea241f9aa44fa0e9a1f5c3f5e088d3c..d757bf097b1b18720f5e79ad053c911fa0a6d609 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -33,8 +33,11 @@ void SimpleValidTest() { "Input", {1, 3, 3, 2}, {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); net.AddInputFromArray( - "Filter", {1, 2, 2, 2}, {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}); - net.AddInputFromArray("Bias", {2}, {.1f, .2f}); + "Filter", + {1, 2, 2, 2}, + {1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 4.0f, 6.0f, 8.0f}, + true); + net.AddInputFromArray("Bias", {2}, {.1f, .2f}, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -52,17 +55,11 @@ void SimpleValidTest() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) @@ -70,11 +67,6 @@ void SimpleValidTest() { .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } @@ -126,10 +118,13 @@ void ComplexValidTest(index_t batch, 
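// Throughout these test and benchmark hunks, constant tensors (filters,
// biases, output-shape tensors) gain an extra boolean argument when they are
// added to the net. This presumably marks them as weights so that the
// constructor-time TransformFilter calls can find and pre-transform them;
// the parameter name below is an assumption, the call pattern is taken from
// the hunks themselves (template arguments reconstructed):
net.AddRandomInput<DeviceType::GPU, float>(
    "Input", {batch, height, width, channel});          // activation: not a weight
net.AddRandomInput<DeviceType::GPU, float>(
    "Filter", {multiplier, channel, kernel_h, kernel_w},
    /*is_weight=*/true);                                 // constant: flagged
net.AddRandomInput<DeviceType::GPU, float>(
    "Bias", {multiplier * channel}, /*is_weight=*/true);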
GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, &filter_data); net.AddInputFromArray( - "Filter", {multiplier, channel, kernel, kernel}, filter_data); + "Filter", {multiplier, channel, kernel, kernel}, filter_data, true); std::vector bias_data(channel * multiplier); GenerateRandomRealTypeData({channel * multiplier}, &bias_data); - net.AddInputFromArray("Bias", {channel * multiplier}, bias_data); + net.AddInputFromArray("Bias", + {channel * multiplier}, + bias_data, + true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -149,17 +144,11 @@ void ComplexValidTest(index_t batch, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) @@ -167,11 +156,6 @@ void ComplexValidTest(index_t batch, .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else { MACE_NOT_IMPLEMENTED; } @@ -182,7 +166,7 @@ void ComplexValidTest(index_t batch, index_t pad_top = ((out_height - 1) * stride + kernel - height) >> 1; index_t pad_left = ((out_width - 1) * stride + kernel - width) >> 1; index_t out_channels = channel * multiplier; - std::vector expect(batch * out_height * out_width * out_channels); + std::vector expect(batch * out_height * out_width * out_channels); for (index_t b = 0; b < batch; ++b) { for (index_t h = 0; h < out_height; ++h) { for (index_t w = 0; w < out_width; ++w) { @@ -212,12 +196,12 @@ void ComplexValidTest(index_t batch, } auto expected = - net.CreateTensor({1, out_height, out_width, out_channels}, expect); + net.CreateTensor({1, out_height, out_width, out_channels}, expect); if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } } } // namespace @@ -260,9 +244,10 @@ void TestNxNS12(const index_t height, const index_t width) { net.AddRandomInput( "Input", {batch, height, width, channel}); net.AddRandomInput( - "Filter", {multiplier, channel, kernel_h, kernel_w}); + "Filter", {multiplier, channel, kernel_h, kernel_w}, true); net.AddRandomInput("Bias", - {multiplier * channel}); + {multiplier * channel}, + true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -289,17 +274,11 @@ void TestNxNS12(const index_t height, const index_t width) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - 
.Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) @@ -309,17 +288,12 @@ void TestNxNS12(const index_t height, const index_t width) { .Finalize(net.NewOperatorDef()); net.RunOp(DeviceType::GPU); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "DeviceOutput", - ops::BufferType::IN_OUT_CHANNEL); - // Check if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } else { - ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } }; @@ -366,12 +340,12 @@ void QuantSimpleValidTest() { net.AddInputFromArray( "Input", {1, 3, 3, 2}, {31, 98, 1, 54, 197, 172, 70, 146, 255, 71, 24, 182, 28, 78, 85, 96, 180, - 59}, 0.00735299, 86); + 59}, false, 0.00735299, 86); net.AddInputFromArray( "Filter", {3, 3, 2, 1}, {212, 239, 110, 170, 216, 91, 162, 161, 255, 2, 10, 120, 183, 101, 100, - 33, 137, 51}, 0.0137587, 120); - net.AddInputFromArray("Bias", {2}, {2, 2}); + 33, 137, 51}, true, 0.0137587, 120); + net.AddInputFromArray("Bias", {2}, {2, 2}, true); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") .Input("Filter") @@ -408,13 +382,13 @@ void TestQuant(const index_t batch, OpsTestNet net; const index_t out_channels = multiplier * in_channels; net.AddRandomInput( - "Input", {batch, in_height, in_width, in_channels}, false); + "Input", {batch, in_height, in_width, in_channels}, false, false); net.AddRandomInput( - "Filter", {k_height, k_width, in_channels, multiplier}, false); - net.AddRandomInput("Bias", {out_channels}); + "Filter", {k_height, k_width, in_channels, multiplier}, true, false); + net.AddRandomInput("Bias", {out_channels}, true); net.TransformDataFormat( "Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat( + net.TransformFilterDataFormat( "Filter", HWIO, "FilterOIHW", OIHW); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") @@ -467,7 +441,7 @@ void TestQuant(const index_t batch, bias_data, bias->size(), q_input->scale() * q_filter->scale(), 0, q_bias.data()); net.AddInputFromArray( - "QuantizedBias", {out_channels}, q_bias); + "QuantizedBias", {out_channels}, q_bias, true); OpDefBuilder("DepthwiseConv2d", "QuantizedDepthwiseConv2DTest") .Input("QuantizedInput") .Input("QuantizedFilter") diff --git a/mace/ops/depthwise_deconv2d.cc b/mace/ops/depthwise_deconv2d.cc index e3dcb1e00bcdb24208fd2e501c9a12439677f8cb..3f10a514cec8712b583b1f0fcae2166fe747da46 100644 --- a/mace/ops/depthwise_deconv2d.cc +++ b/mace/ops/depthwise_deconv2d.cc @@ -29,6 +29,7 @@ #include "mace/utils/utils.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/depthwise_deconv2d.h" #endif // MACE_ENABLE_OPENCL @@ -408,11 +409,21 @@ class DepthwiseDeconv2dOp : public Deconv2dOpBase { public: explicit DepthwiseDeconv2dOp(OpConstructContext *context) : Deconv2dOpBase(context) { + MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::DepthwiseDeconv2dKernel); } else { MACE_NOT_IMPLEMENTED; } + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 1, + OpenCLBufferType::DW_CONV2D_FILTER, mem_type) + == MaceStatus::MACE_SUCCESS); + if (operator_def_->input_size() >= 
3) { + MACE_CHECK(TransformFilter( + context, operator_def_.get(), 2, + OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/depthwise_deconv2d_benchmark.cc b/mace/ops/depthwise_deconv2d_benchmark.cc index 3e3da26fe31bda1f5c3873bbf5f143309bd0247e..081e10d27ce6748d397f635d53b9f74673a15c20 100644 --- a/mace/ops/depthwise_deconv2d_benchmark.cc +++ b/mace/ops/depthwise_deconv2d_benchmark.cc @@ -44,32 +44,16 @@ static void DepthwiseDeconv2d(int iters, } net.AddRandomInput("Filter", {1, channels, kernel_h, - kernel_w}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntArg("group", channels) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("Input") - .Input("Filter") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntArg("group", channels) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } + kernel_w}, true); + OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("group", channels) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.Setup(D); diff --git a/mace/ops/depthwise_deconv2d_test.cc b/mace/ops/depthwise_deconv2d_test.cc index b1f36845c7d351bfad879be4b0895071b9842a68..fe3b0b18a06c076d87e130b2dc1b17f1599577b1 100644 --- a/mace/ops/depthwise_deconv2d_test.cc +++ b/mace/ops/depthwise_deconv2d_test.cc @@ -38,33 +38,23 @@ void RunTestSimple(const int group, OpsTestNet net; // Add input data net.AddInputFromArray("Input", input_shape, input_data); - net.AddInputFromArray("Filter", filter_shape, filter_data); - net.TransformDataFormat("Filter", HWOI, "FilterOIHW", OIHW); + net.AddInputFromArray("Filter", filter_shape, filter_data, true); + net.TransformFilterDataFormat("Filter", HWOI, "FilterOIHW", OIHW); const index_t out_channels = expected_shape[3]; - net.AddInputFromArray("Bias", {out_channels}, bias_data); + net.AddInputFromArray("Bias", {out_channels}, bias_data, true); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "FilterOIHW", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("FilterOIHW") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntArg("group", group) .AddIntsArg("padding_values", paddings) .Finalize(net.NewOperatorDef()); net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ 
-161,22 +151,22 @@ TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Depthwise) { } TEST_F(DepthwiseDeconv2dOpTest, CPUSimple3X3Group) { -TestNHWCSimple3x3_Group(); + TestNHWCSimple3x3_Group(); } TEST_F(DepthwiseDeconv2dOpTest, GPUSimple3X3Depthwise) { -TestNHWCSimple3x3_DW(); + TestNHWCSimple3x3_DW(); } namespace { template void RandomTest(index_t batch, - index_t channel, - index_t height, - index_t width, - index_t kernel, - int stride, - int padding) { + index_t channel, + index_t height, + index_t width, + index_t kernel, + int stride, + int padding) { testing::internal::LogToStderr(); // Construct graph OpsTestNet net; @@ -195,12 +185,12 @@ void RandomTest(index_t batch, GenerateRandomRealTypeData({multiplier, channel, kernel, kernel}, &filter_data); net.AddInputFromArray( - "Filter", {multiplier, channel, kernel, kernel}, filter_data); + "Filter", {multiplier, channel, kernel, kernel}, filter_data, true); std::vector bias_data(channel * multiplier); GenerateRandomRealTypeData({channel * multiplier}, &bias_data); net.AddInputFromArray("Bias", {channel * multiplier}, - bias_data); + bias_data, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -226,17 +216,11 @@ void RandomTest(index_t batch, expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::DW_CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); OpDefBuilder("DepthwiseDeconv2d", "DepthwiseDeconv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") .AddIntsArg("strides", {stride, stride}) .AddIntsArg("padding_values", {padding, padding}) .AddIntArg("group", channel) @@ -245,14 +229,10 @@ void RandomTest(index_t batch, net.RunOp(DeviceType::GPU); - // Transfer output - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } } diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index 96384cde4fac981064952a7cc7f916e671b63ab6..863b69edc2033e54866f5935b097d4f93c968395 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -1097,13 +1097,16 @@ class EltwiseOp : public Operation { } // Transform filters int input_size = operator_def_->input_size(); + Workspace *ws = context->workspace(); for (int i = 0; i < input_size; ++i) { - const Tensor *input_tensor = context->workspace()->GetTensor( - operator_def_->input(i)); - if (input_tensor != nullptr && input_tensor->is_weight()) { + if (ws->HasTensor(operator_def_->input(i)) && + ws->GetTensor(operator_def_->input(i))->is_weight()) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), i, BufferType::ARGUMENT, mem_type) - == MaceStatus::MACE_SUCCESS); + context, + operator_def_.get(), + i, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); } } } diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index 82fbc63f25b3b587d1d839d7e4c69d5090038a89..95808bc336a46231d920a7c409e846b89725e2ed 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -30,37 +30,23 @@ void 
EltwiseBenchmark( OpsTestNet net; // Add input data - net.AddRandomInput("Input0", {n, h, w, c}); - net.AddRandomInput("Input1", {n, h, w, c}); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg0") - .Input("InputImg1") - .AddIntArg("type", static_cast(type)) - .AddFloatsArg("coeff", {1.2, 2.1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input0", {n, h, w, c}); + net.AddRandomInput("Input1", {n, h, w, c}); } else { - net.TransformDataFormat("Input0", NHWC, - "TInput0", NCHW); - net.TransformDataFormat("Input1", NHWC, - "TInput1", NCHW); - OpDefBuilder("Eltwise", "EltwiseTest") - .Input("TInput0") - .Input("TInput1") - .AddIntArg("type", static_cast(type)) - .AddFloatsArg("coeff", {1.2, 2.1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("Output") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input0", {n, c, h, w}); + net.AddRandomInput("Input1", {n, c, h, w}); } + OpDefBuilder("Eltwise", "EltwiseTest") + .Input("Input0") + .Input("Input1") + .AddIntArg("type", static_cast(type)) + .AddFloatsArg("coeff", {1.2, 2.1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index ac920ac00f1150ca3336e78e4087bd2ec0ce545a..a6d7ea21d313ca9a979f3473285daa91d25b0c08 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -82,20 +82,15 @@ void SimpleTensorScalar(const ops::EltwiseType type, net.RunOp(D); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg") + .Input("Input") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", x) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } auto expected = net.CreateTensor(shape, output); @@ -145,23 +140,16 @@ void SimpleTensorEltwise(const ops::EltwiseType type, net.RunOp(D); net.TransformDataFormat("TOutput", NCHW, "Output", NHWC); } else { - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg0") - .Input("InputImg1") + .Input("Input0") + .Input("Input1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } std::vector output_shape = shape0; @@ -204,26 +192,19 @@ void TensorGeneralBroadcastEltwise(const ops::EltwiseType type, // Run net.RunOp(D); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input0", "InputImage0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest") .AddIntArg("T", DataTypeToEnum::v()) - .Input("InputImage0") - .Input("InputImage1") + .Input("Input0") + 
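// In the eltwise.cc hunk further above, the constructor no longer transforms
// every input blindly: it asks the workspace whether the input exists and is
// a weight, and only then converts it to an ARGUMENT buffer, leaving runtime
// activations to the normal input-transform path. Sketch (<T> reconstructed):
int input_size = operator_def_->input_size();
Workspace *ws = context->workspace();
for (int i = 0; i < input_size; ++i) {
  if (ws->HasTensor(operator_def_->input(i)) &&
      ws->GetTensor(operator_def_->input(i))->is_weight()) {
    MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), i,
                                  OpenCLBufferType::ARGUMENT, mem_type)
               == MaceStatus::MACE_SUCCESS);
  }
}
// The eltwise benchmark above also stops calling TransformDataFormat and
// instead allocates its random inputs directly in the device's native
// layout: NHWC ({n, h, w, c}) for GPU, NCHW ({n, c, h, w}) for CPU.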
.Input("Input1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) .OutputType({ops::IsLogicalType(type) ? DT_INT32 : DT_FLOAT}) - .Output("OutputImage"); + .Output("Output"); op_builder.Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -483,7 +464,7 @@ void RandomTensorScalar(const ops::EltwiseType type, OpsTestNet net; // Add input data - net.AddRandomInput("Input", shape, true, true); + net.AddRandomInput("Input", shape, false, true, true); net.TransformDataFormat("Input", NHWC, "TInput", NCHW); @@ -501,26 +482,21 @@ void RandomTensorScalar(const ops::EltwiseType type, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg") + .Input("Input") .AddIntArg("type", static_cast(type)) .AddFloatArg("scalar_input", 0.1) - .Output("OutputImg") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImg", "GPUOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } @@ -533,8 +509,16 @@ void RandomTensorEltwise(const ops::EltwiseType type, OpsTestNet net; // Add input data - net.AddRandomInput("Input0", shape0, true, true); - net.AddRandomInput("Input1", shape1, true, true); + net.AddRandomInput("Input0", + shape0, + false, + true, + true); + net.AddRandomInput("Input1", + shape1, + false, + true, + true); net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); @@ -556,29 +540,22 @@ void RandomTensorEltwise(const ops::EltwiseType type, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Eltwise", "EltwiseTest") - .Input("InputImg0") - .Input("InputImg1") + .Input("Input0") + .Input("Input1") .AddIntArg("type", static_cast(type)) .AddFloatsArg("coeff", coeff) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImg", "GPUOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { - ExpectTensorNear(*expected, *net.GetOutput("GPUOutput"), 1e-2, 1e-2); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } @@ -587,8 +564,16 @@ void QuantizedSum(const std::vector &shape) { OpsTestNet net; // Add input data - net.AddRandomInput("Input0", shape, true, true); - net.AddRandomInput("Input1", shape, true, true); + net.AddRandomInput("Input0", + shape, + false, + true, + true); + net.AddRandomInput("Input1", + shape, + false, + true, + true); net.TransformDataFormat("Input0", NHWC, "TInput0", NCHW); diff --git a/mace/ops/folded_batch_norm_test.cc 
b/mace/ops/folded_batch_norm_test.cc index 35e69cd159942d328ab46d9ef54fb98acef66e4e..e645eb4e8b7fb665e821a6affc07433953702421 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -45,8 +45,8 @@ void Simple() { std::vector scale(1); std::vector offset(1); CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, &scale, &offset); - net.AddInputFromArray("Scale", {1}, scale); - net.AddInputFromArray("Offset", {1}, offset); + net.AddInputFromArray("Scale", {1}, scale, true); + net.AddInputFromArray("Offset", {1}, offset, true); if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -60,25 +60,14 @@ void Simple() { net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } // Check @@ -108,8 +97,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -132,27 +121,16 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - net.Sync(); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -170,8 +148,8 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -194,18 +172,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - 
.Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -213,9 +184,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.RunOp(DeviceType::GPU); net.Sync(); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } @@ -233,8 +202,8 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { // Add input data net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Scale", {channels}); - net.AddRandomInput("Offset", {channels}); + net.AddRandomInput("Scale", {channels}, true); + net.AddRandomInput("Offset", {channels}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -257,26 +226,17 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-4); } @@ -318,27 +278,18 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Scale", "ScaleImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "Offset", "OffsetImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("BatchNorm", "FoldedBatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Output("OutputImage") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Output("Output") .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); // Run on opencl net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-2); } diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc index cea80bf51e998300b53e5c8729a66aa82147fc0b..ef919d9292bab8b2474a40ab30053b587bd79d96 100644 --- a/mace/ops/fully_connected.cc +++ b/mace/ops/fully_connected.cc @@ -202,11 +202,14 @@ class FullyConnectedOp : public FullyConnectedOpBase { } // Transform filter tensor to target format MACE_CHECK(TransformFilter( - context, operator_def_.get(), 1, BufferType::WEIGHT_WIDTH, mem_type) - == MaceStatus::MACE_SUCCESS); + context, + operator_def_.get(), + 1, + OpenCLBufferType::WEIGHT_WIDTH, + mem_type) == MaceStatus::MACE_SUCCESS); if (operator_def_->input_size() > 2) { MACE_CHECK(TransformFilter( - context, operator_def_.get(), 2, BufferType::ARGUMENT, mem_type) + 
context, operator_def_.get(), 2, OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS); } } diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index 6b75e60dd93648045af1719947708735ab1226c3..bb27c97dcdf2197c6f1e60ef59589b4d7a39b429 100644 --- a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -30,42 +30,25 @@ void FCBenchmark( OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channel}); + if (D == DeviceType::GPU) { + net.AddRandomInput("Input", {batch, height, width, channel}); + } else { + net.AddRandomInput("Input", {batch, channel, height, width}); + } + net.AddRandomInput("Weight", - {out_channel, channel, height, width}); - net.AddRandomInput("Bias", {out_channel}); - - if (D == DeviceType::CPU) { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputNCHW") + {out_channel, channel, height, width}, true); + net.AddRandomInput("Bias", {out_channel}, true); + + OpenCLBufferType weight_type = OpenCLBufferType::WEIGHT_WIDTH; + OpDefBuilder("FullyConnected", "FullyConnectedTest") + .Input("Input") .Input("Weight") .Input("Bias") .Output("Output") + .AddIntArg("weight_type", static_cast(weight_type)) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - ops::BufferType weight_type = ops::BufferType::WEIGHT_WIDTH; - BufferToImage(&net, "Weight", "WeightImage", - weight_type); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntArg("weight_type", static_cast(weight_type)) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 0fd98848d323198dfdf108b7f25ad51667dd6ada..26134bb5e140e4b01aecb3c87a63dcb95bcf6aff 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -36,8 +36,8 @@ void Simple(const std::vector &input_shape, // Add input data net.AddInputFromArray("Input", input_shape, input_value); - net.AddInputFromArray("Weight", weight_shape, weight_value); - net.AddInputFromArray("Bias", bias_shape, bias_value); + net.AddInputFromArray("Weight", weight_shape, weight_value, true); + net.AddInputFromArray("Bias", bias_shape, bias_value, true); if (D == DeviceType::CPU) { OpDefBuilder("FullyConnected", "FullyConnectedTest") @@ -50,25 +50,14 @@ void Simple(const std::vector &input_shape, net.RunOp(D); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Weight") + .Input("Bias") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - // Transfer output - 
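// FullyConnected follows the same constructor-transform pattern, but its
// weight (input 1) uses the WEIGHT_WIDTH image layout rather than
// CONV2D_FILTER, and the bias (input 2) becomes an ARGUMENT buffer. The
// benchmark now records that choice through an explicit "weight_type" op
// argument instead of calling BufferToImage itself; roughly (the casts are
// reconstructed, since extraction dropped the angle brackets):
OpenCLBufferType weight_type = OpenCLBufferType::WEIGHT_WIDTH;
OpDefBuilder("FullyConnected", "FullyConnectedTest")
    .Input("Input")
    .Input("Weight")
    .Input("Bias")
    .Output("Output")
    .AddIntArg("weight_type", static_cast<int>(weight_type))
    .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
    .Finalize(net.NewOperatorDef());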
ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { MACE_NOT_IMPLEMENTED; } @@ -136,8 +125,8 @@ void Random(const index_t batch, net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput( - "Weight", {out_channel, channels, height, width}); - net.AddRandomInput("Bias", {out_channel}); + "Weight", {out_channel, channels, height, width}, true); + net.AddRandomInput("Bias", {out_channel}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -158,31 +147,22 @@ void Random(const index_t batch, expected->Copy(*net.GetOutput("Output")); // Run on opencl - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::WEIGHT_WIDTH); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - OpDefBuilder("FullyConnected", "FullyConnectedTest") - .Input("InputImage") - .Input("WeightImage") - .Input("BiasImage") - .Output("OutputImage") + .Input("Input") + .Input("Weight") + .Input("Bias") + .Output("Output") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-1, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-1, 1e-1); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2, 1e-3); } } @@ -228,10 +208,10 @@ void QuantRandom(const index_t batch, net.AddRandomInput( "Input", {batch, height, width, channels}); net.AddRandomInput( - "Weight", {out_channel, height, width, channels}); - net.AddRandomInput("Bias", {out_channel}); + "Weight", {out_channel, height, width, channels}, true); + net.AddRandomInput("Bias", {out_channel}, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - net.TransformDataFormat("Weight", OHWI, "WeightOIHW", OIHW); + net.TransformFilterDataFormat("Weight", OHWI, "WeightOIHW", OIHW); OpDefBuilder("FullyConnected", "FullyConnectedTest") .Input("InputNCHW") diff --git a/mace/ops/lstm_cell.cc b/mace/ops/lstm_cell.cc index 19abafe098abbbe97fb93e46c13c22af30f6bb0d..dfbfa155a31377dbbbd20cbd7d6c6ebe5df48838 100644 --- a/mace/ops/lstm_cell.cc +++ b/mace/ops/lstm_cell.cc @@ -16,6 +16,7 @@ #include #include "mace/core/operator.h" +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/lstm_cell.h" namespace mace { @@ -30,13 +31,43 @@ class LSTMCellOp : public Operation { explicit LSTMCellOp(OpConstructContext *context) : Operation(context) { T forget_bias = static_cast( - Operation::GetOptionalArg("scalar_input", - 0.0)); + Operation::GetOptionalArg("scalar_input", + 0.0)); + MemoryType mem_type = MemoryType::GPU_IMAGE; if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); } else { MACE_NOT_IMPLEMENTED; } + // Transform filters + const Tensor *pre_output = context->workspace()->GetTensor( + operator_def_->input(1)); + if (pre_output->is_weight()) { + MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 1, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + } + MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 2, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + 
MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 3, + OpenCLBufferType::ARGUMENT, + mem_type) == MaceStatus::MACE_SUCCESS); + const Tensor *pre_cell = context->workspace()->GetTensor( + operator_def_->input(4)); + if (pre_cell->is_weight()) { + MACE_CHECK(TransformFilter(context, + operator_def_.get(), + 4, + OpenCLBufferType::IN_OUT_CHANNEL, + mem_type) == MaceStatus::MACE_SUCCESS); + } } MaceStatus Run(OpContext *context) override { diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc index b8840bba42a1617380a764bae1431ac1e78d24fd..6568025a1a169ed856cf3df8704f635bb9824b2b 100644 --- a/mace/ops/lstmcell_benchmark.cc +++ b/mace/ops/lstmcell_benchmark.cc @@ -29,11 +29,11 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) { // Add input data net.AddRandomInput("Input", {batch, input_size}); - net.AddRandomInput("PreOutput", {batch, hidden_units}); + net.AddRandomInput("PreOutput", {batch, hidden_units}, true); net.AddRandomInput("Weight", {input_size + hidden_units, - 4 * hidden_units}); - net.AddRandomInput("Bias", {4 * hidden_units}); - net.AddRandomInput("PreCell", {batch, hidden_units}); + 4 * hidden_units}, true); + net.AddRandomInput("Bias", {4 * hidden_units}, true); + net.AddRandomInput("PreCell", {batch, hidden_units}, true); const float &forget_add = 0.0f; @@ -45,28 +45,17 @@ void LSTMCell(int iters, int batch, int input_size, int hidden_units) { net.CopyData("PreCell", "PreCellCPU"); LSTMCellCPU(&net, "InputCPU", "PreOutputCPU", "WeightCPU", "BiasCPU", - "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); + "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "PreOutput", "PreOutputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "PreCell", "PreCellImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("LSTMCell", "LSTMCellTest") - .Input("InputImage") - .Input("PreOutputImage") - .Input("WeightImage") - .Input("BiasImage") - .Input("PreCellImage") + .Input("Input") + .Input("PreOutput") + .Input("Weight") + .Input("Bias") + .Input("PreCell") .AddFloatArg("scalar_input", forget_add) - .Output("CellImage") - .Output("OutputImage") + .Output("Cell") + .Output("Output") .Finalize(net.NewOperatorDef()); } else { MACE_NOT_IMPLEMENTED; diff --git a/mace/ops/lstmcell_test.cc b/mace/ops/lstmcell_test.cc index 99dea59c10827832d21cdc699c6105446fc4fc7b..2d1affeeb41840d4d25d58041bb77685f60c1066 100644 --- a/mace/ops/lstmcell_test.cc +++ b/mace/ops/lstmcell_test.cc @@ -32,11 +32,11 @@ void TestLSTMCell(const uint32_t &batch, OpsTestNet net; net.AddRandomInput("Input", {batch, input_size}); - net.AddRandomInput("PreOutput", {batch, hidden_units}); + net.AddRandomInput("PreOutput", {batch, hidden_units}, true); net.AddRandomInput("Weight", {input_size + hidden_units, - 4 * hidden_units}); - net.AddRandomInput("Bias", {4 * hidden_units}); - net.AddRandomInput("PreCell", {batch, hidden_units}); + 4 * hidden_units}, true); + net.AddRandomInput("Bias", {4 * hidden_units}, true); + net.AddRandomInput("PreCell", {batch, hidden_units}, true); net.CopyData("Input", "InputCPU"); net.CopyData("PreOutput", "PreOutputCPU"); @@ -46,42 +46,25 @@ void TestLSTMCell(const uint32_t &batch, // Run on CPU LSTMCellCPU(&net, "InputCPU", "PreOutputCPU", 
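// The lstm_cell.cc hunk above applies the same idea to LSTMCell: the weight
// (input 2) and bias (input 3) are always constants and are transformed
// unconditionally, while PreOutput (input 1) and PreCell (input 4) are only
// transformed when the workspace reports them as weights, which is how the
// tests above now feed them. Condensed (<T> reconstructed):
if (context->workspace()->GetTensor(operator_def_->input(1))->is_weight()) {
  MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 1,
                                OpenCLBufferType::IN_OUT_CHANNEL, mem_type)
             == MaceStatus::MACE_SUCCESS);
}
MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 2,
                              OpenCLBufferType::IN_OUT_CHANNEL, mem_type)
           == MaceStatus::MACE_SUCCESS);
MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 3,
                              OpenCLBufferType::ARGUMENT, mem_type)
           == MaceStatus::MACE_SUCCESS);
if (context->workspace()->GetTensor(operator_def_->input(4))->is_weight()) {
  MACE_CHECK(TransformFilter<T>(context, operator_def_.get(), 4,
                                OpenCLBufferType::IN_OUT_CHANNEL, mem_type)
             == MaceStatus::MACE_SUCCESS);
}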
"WeightCPU", "BiasCPU", - "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); + "PreCellCPU", forget_add, "CellCPU", "OutputCPU"); // Run net.RunOp(DeviceType::CPU); // Run on GPU - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "PreOutput", "PreOutputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Weight", "WeightImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Bias", "BiasImage", - ops::BufferType::ARGUMENT); - BufferToImage(&net, "PreCell", "PreCellImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("LSTMCell", "LSTMCellTest") - .Input("InputImage") - .Input("PreOutputImage") - .Input("WeightImage") - .Input("BiasImage") - .Input("PreCellImage") + .Input("Input") + .Input("PreOutput") + .Input("Weight") + .Input("Bias") + .Input("PreCell") .AddFloatArg("scalar_input", forget_add) - .Output("CellImage") - .Output("OutputImage") + .Output("Cell") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - ImageToBuffer(&net, "CellImage", "Cell", - ops::BufferType::IN_OUT_CHANNEL); - - Tensor expected_cell, expected_output; expected_cell.Copy(*net.GetOutput("CellCPU")); expected_output.Copy(*net.GetOutput("OutputCPU")); diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 614788d8100ae5080642ab6202fa7b53535a5b75..411f0f16edae58548441d1fc696c3802e6e3bf20 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -31,6 +31,7 @@ #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_OPENCL +#include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/opencl/image/matmul.h" #endif // MACE_ENABLE_OPENCL @@ -351,11 +352,8 @@ class MatMulOp : public MatMulOpBase { public: explicit MatMulOp(OpConstructContext *context) : MatMulOpBase(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::MatMulKernel); - } else { - MACE_NOT_IMPLEMENTED; - } + MACE_UNUSED(context); + MACE_NOT_IMPLEMENTED; } MaceStatus Run(OpContext *context) override { Validate(); diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index 2d62d86a8690485df773616d65a122eaf7ac77fa..f118e63f4680b68f0f77bc55697cf318f729caaa 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -301,26 +301,12 @@ void MatMulBenchmark( net.GetTensor("A")->SetScale(0.1); net.GetTensor("B")->SetScale(0.1); } - if (D == DeviceType::GPU) { - BufferToImage(&net, "A", "AImage", - ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulBM") - .Input("AImage") - .Input("BImage") - .Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("MatMul", "MatMulBM") - .Input("A") - .Input("B") - .Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("MatMul", "MatMulBM") + .Input("A") + .Input("B") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.Setup(D); if (DataTypeToEnum::value == DT_UINT8) { @@ -401,8 +387,6 @@ void MatMulTransposeBenchmark( #define MACE_BM_MATMUL_OP(N, H, C, W) \ MACE_BM_MATMUL_MACRO(N, H, C, W, float, CPU); \ - MACE_BM_MATMUL_MACRO(N, H, C, W, float, GPU); \ - MACE_BM_MATMUL_MACRO(N, H, C, W, half, GPU); \ MACE_BM_MATMUL_MACRO(N, H, C, W, uint8_t, CPU); #define 
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, TYPE, DEVICE) \ diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 82187b8b1903d2e1b7137be680f5ff4ab1b4e4a8..f2ed8478cdf5801c4d76827cd3e0abc699090cc9 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -36,32 +36,13 @@ void Simple(const std::vector &A_shape, net.AddInputFromArray("A", A_shape, A_value); net.AddInputFromArray("B", B_shape, B_value); - if (D == DeviceType::GPU) { - BufferToImage(&net, "A", "AImage", - ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulTest") - .Input("AImage") - .Input("BImage") - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_HEIGHT); - } else { - OpDefBuilder("MatMul", "MatMulTest") - .Input("A") - .Input("B") - .Output("Output") - .Finalize(net.NewOperatorDef()); - // Run - net.RunOp(D); - } + OpDefBuilder("MatMul", "MatMulTest") + .Input("A") + .Input("B") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); // Check auto expected = net.CreateTensor(C_shape, C_value); @@ -89,129 +70,6 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) { {2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64}); } -TEST_F(MatMulOpTest, SimpleOPENCL) { - Simple({1, 2, 3}, {1, 2, 3, 4, 5, 6}, {1, 3, 2}, - {1, 2, 3, 4, 5, 6}, {1, 2, 2}, {22, 28, 49, 64}); - Simple( - {1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, - {1, 5, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, - {1, 5, 5}, {215, 230, 245, 260, 275, 490, 530, 570, 610, - 650, 765, 830, 895, 960, 1025, 1040, 1130, 1220, - 1310, 1400, 1315, 1430, 1545, 1660, 1775}); -} - -TEST_F(MatMulOpTest, SimpleGPUWithBatch) { - Simple({2, 2, 3}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, - {2, 3, 2}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, - {2, 2, 2}, {22, 28, 49, 64, 22, 28, 49, 64}); -} - -namespace { -template -void Complex(const std::vector &batch, - const index_t height, - const index_t channels, - const index_t out_width) { - srand(time(NULL)); - - // Construct graph - OpsTestNet net; - - // Add input data - index_t batch_count = std::accumulate(batch.begin(), batch.end(), 1, - std::multiplies()); - net.AddRandomInput("A", - {batch_count, height, channels}); - net.AddRandomInput( - "B", {batch_count, channels, out_width}); - - // Run on opencl - BufferToImage(&net, "A", "AImage", - ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulTest") - .Input("AImage") - .Input("BImage") - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - net.RunOp(DeviceType::GPU); - - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_HEIGHT); - - // run cpu - std::vector shape_a = batch; - shape_a.push_back(height); - shape_a.push_back(channels); - std::vector shape_b = batch; - shape_b.push_back(channels); - shape_b.push_back(out_width); - std::vector expected_output_shape = batch; - expected_output_shape.push_back(height); - expected_output_shape.push_back(out_width); - - net.GetTensor("A")->Reshape(shape_a); - net.GetTensor("B")->Reshape(shape_b); - - OpDefBuilder("MatMul", "MatMulTest") - .Input("A") - .Input("B") - .Output("Output") - .Finalize(net.NewOperatorDef()); - 
- net.RunOp(); - - // Check - EXPECT_EQ(expected_output_shape, net.GetOutput("Output")->shape()); - - auto expected = net.CreateTensor(); - expected->Copy(*net.GetOutput("Output")); - expected->Reshape({batch_count, height, out_width}); - - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-2, - 1e-1); - } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5, - 1e-5); - } -} -} // namespace - -TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) { - Complex({1}, 64, 128, 32); - Complex({1}, 64, 32, 128); - Complex({2, 3}, 64, 32, 128); -} -TEST_F(MatMulOpTest, OPENCLUnAlignedWithoutBatch) { - Complex({1}, 31, 113, 61); - Complex({1}, 113, 31, 73); - Complex({2, 3}, 113, 31, 73); -} -TEST_F(MatMulOpTest, OPENCLUnAlignedWithBatch) { - Complex({2}, 3, 3, 3); - Complex({16}, 31, 61, 67); - Complex({31}, 31, 61, 67); - Complex({2, 3}, 31, 61, 67); -} -TEST_F(MatMulOpTest, OPENCLHalfAlignedWithoutBatch) { - Complex({1}, 64, 128, 32); - Complex({1}, 64, 32, 128); - Complex({2, 3}, 64, 32, 128); -} -TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) { - Complex({2}, 31, 113, 61); - Complex({16}, 32, 64, 64); - Complex({31}, 31, 61, 67); - Complex({2, 3}, 31, 61, 67); -} - namespace { void QuantOutputUint8(const std::vector &batch, const index_t height, diff --git a/mace/ops/opencl/buffer/buffer_inverse_transform.h b/mace/ops/opencl/buffer/buffer_inverse_transform.h deleted file mode 100644 index 8b05bf5f0c34e801d501b390b05f64cb4b7e29c8..0000000000000000000000000000000000000000 --- a/mace/ops/opencl/buffer/buffer_inverse_transform.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ - -#include "mace/ops/opencl/buffer_transformer.h" - -#include "mace/core/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" - -namespace mace { -namespace ops { -namespace opencl { -namespace buffer { - -MaceStatus BufferTypeTransform( - OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const DataType dt, - Tensor *output); - -template -class BufferInverseTransform: public OpenCLBufferTransformKernel { - public: - MaceStatus Compute(OpContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output) override; - private: - cl::Kernel kernel_; -}; - -template -MaceStatus BufferInverseTransform::Compute(OpContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output) { - MACE_UNUSED(type); - MACE_UNUSED(wino_blk_size); - const DataType dt = DataTypeToEnum::value; - if (input->dtype() != output->dtype()) { - return BufferTypeTransform(context, &kernel_, input, dt, output); - } else { - SetFutureDefaultWaitFn(context->future()); - output->ReuseTensorBuffer(*input); - return MaceStatus::MACE_SUCCESS; - } -} - -} // namespace buffer -} // namespace opencl -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/opencl/buffer/buffer_transform.cc b/mace/ops/opencl/buffer/buffer_transform.cc index 29f467e2d0d3292508eb5fa4997492b61176642f..9ba3f81d1e7b59bd1c7b0b015616da1cec775ac7 100644 --- a/mace/ops/opencl/buffer/buffer_transform.cc +++ b/mace/ops/opencl/buffer/buffer_transform.cc @@ -91,8 +91,6 @@ MaceStatus TransformConv2DFilter( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION - // Mark the buffer unused. - const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } @@ -159,8 +157,6 @@ MaceStatus TransformDWConv2DFilter( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION - // Mark the buffer unused. - const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } @@ -230,8 +226,6 @@ MaceStatus TransformArgument( } }; } - // Mark the buffer unused. 
- const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h index c9e31cfa04432d3b2758a13993e0850224d7cf43..7f9eae2125be87790151a26f404cb4119890ecd2 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -63,7 +63,7 @@ class BufferTransform: public OpenCLBufferTransformKernel { MaceStatus Compute( OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) override; @@ -75,7 +75,7 @@ class BufferTransform: public OpenCLBufferTransformKernel { template MaceStatus BufferTransform::Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) { MACE_UNUSED(type); @@ -92,8 +92,8 @@ MaceStatus BufferTransform::Compute(OpContext *context, if (input->dtype() != dt) { return BufferTypeTransform(context, &kernel_, input, dt, output); } else { - SetFutureDefaultWaitFn(context->future()); - output->ReuseTensorBuffer(*input); + LOG(FATAL) << "Should not reach here. " << input->name() + << "<" << type << "> to " << output->name(); return MaceStatus::MACE_SUCCESS; } } diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc index d1d52fe4152e7033755517c958cff35b659eebfd..ce405e9f3da2865c4a2547389f15cdb9434f6996 100644 --- a/mace/ops/opencl/buffer/buffer_type_transform.cc +++ b/mace/ops/opencl/buffer/buffer_type_transform.cc @@ -86,8 +86,6 @@ MaceStatus BufferTypeTransform( } }; } - // Mark the buffer unused. - const_cast(input)->MarkUnused(); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index e50d8e5c2ad77b1c4d64ce371f5f6770a4f562ee..dca574047aa79575cd9c7b6b2cabc18f779cb330 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -62,6 +62,14 @@ class Conv2dKernel : public OpenCLConv2dKernel { public: Conv2dKernel() : old_scratch_size_(0) {} + bool CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) override; + MaceStatus Compute( OpContext *context, const Tensor *input, @@ -73,6 +81,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, + const int winograd_blk_size, Tensor *output) override; private: @@ -82,6 +91,23 @@ class Conv2dKernel : public OpenCLConv2dKernel { std::vector input_shape_; }; + +template +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) { + MACE_UNUSED(runtime); + MACE_UNUSED(output_shape); + MACE_UNUSED(wino_block_size); + return (filter_shape[2] == 3 && filter_shape[3] == 3 && + strides[0] == 1 && strides[1] == 1 && + dilations[0] == 1 && dilations[1] == 1); +} + template MaceStatus Conv2dKernel::Compute( OpContext *context, @@ -94,7 +120,9 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, + const int winograd_blk_size, Tensor *output) { + MACE_UNUSED(winograd_blk_size); StatsFuture pad_future, conv_future; index_t filter_h = filter->dim(2); index_t filter_w = filter->dim(3); 
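// --- Hypothetical call-site sketch, not part of this patch: how a conv op
// could consult the new OpenCLConv2dKernel::CheckUseWinograd() before calling
// Compute(). The member names kernel_/strides_/dilations_, the preferred
// block size of 4, and setting 0 on failure are assumptions of this sketch;
// a non-zero block size is what selects the winograd path in the image
// kernel's Compute(). ---
int wino_blk_size = 4;  // preferred block size; the image kernel may lower it to 2
if (!kernel_->CheckUseWinograd(context->device()->opencl_runtime(),
                               filter->shape(),
                               output_shape,
                               strides_.data(),
                               dilations_.data(),
                               &wino_blk_size)) {
  wino_blk_size = 0;  // fall back to the direct 1x1 / 3x3 / general kernels
}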
diff --git a/mace/ops/opencl/buffer_transform_kernel.h b/mace/ops/opencl/buffer_transform_kernel.h index 5d4ff09448cfee8f70af71f2365e43525a9e3087..83159eeaa29db37162981d4752c79adc848be20c 100644 --- a/mace/ops/opencl/buffer_transform_kernel.h +++ b/mace/ops/opencl/buffer_transform_kernel.h @@ -15,7 +15,7 @@ #ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ #define MACE_OPS_OPENCL_BUFFER_TRANSFORM_KERNEL_H_ -#include "mace/ops/opencl/common.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" @@ -27,10 +27,10 @@ class OpenCLBufferTransformKernel { public: virtual MaceStatus Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) }; } // namespace ops } // namespace mace diff --git a/mace/ops/opencl/buffer_transformer.cc b/mace/ops/opencl/buffer_transformer.cc index e3b1b67b417b83879b1949874afa2624795d31f0..1176df1303cfb552aa1e880d855dfb9065e2d245 100644 --- a/mace/ops/opencl/buffer_transformer.cc +++ b/mace/ops/opencl/buffer_transformer.cc @@ -17,7 +17,7 @@ namespace mace { namespace ops { -std::string TransformedName(const std::string &name) { +std::string TransformedFilterName(const std::string &name) { // TODO(liuqi): This may create a conflict. const char *postfix = "_mace_identity_transformed"; return name + postfix; diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h index 78f82177ee3fd091cf55bfcd79a815f1ebaa925d..7acc39a90d7ffb7c89f7d3407402cd27ab19efb6 100644 --- a/mace/ops/opencl/buffer_transformer.h +++ b/mace/ops/opencl/buffer_transformer.h @@ -15,11 +15,15 @@ #ifndef MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ #define MACE_OPS_OPENCL_BUFFER_TRANSFORMER_H_ +#include +#include +#include + #include "mace/core/operator.h" -#include "mace/ops/opencl/common.h" #include "mace/ops/opencl/image/buffer_to_image.h" #include "mace/ops/opencl/image/image_to_buffer.h" #include "mace/ops/opencl/buffer/buffer_transform.h" +#include "mace/ops/transpose.h" namespace mace { namespace ops { @@ -28,10 +32,10 @@ template class OpenCLBufferTransformer { public: OpenCLBufferTransformer(const MemoryType in_mem_type, - const MemoryType out_mem_type) { + const MemoryType out_mem_type) { if (out_mem_type == MemoryType::GPU_IMAGE) { kernel_.reset(new opencl::image::BufferToImage); - } else if (in_mem_type == MemoryType::GPU_IMAGE){ + } else if (in_mem_type == MemoryType::GPU_IMAGE) { kernel_.reset(new opencl::image::ImageToBuffer); } else { kernel_.reset(new opencl::buffer::BufferTransform); @@ -40,9 +44,9 @@ class OpenCLBufferTransformer { MaceStatus Transform(OpContext *context, const Tensor *input, - const BufferType type, - const int wino_blk_size, + const OpenCLBufferType type, const MemoryType out_mem_type, + const int wino_blk_size, Tensor *output) { Workspace *ws = context->workspace(); DataType dt = DataTypeToEnum::value; @@ -54,39 +58,81 @@ class OpenCLBufferTransformer { context, input, type, wino_blk_size, output); } else { // convert to the GPU Buffer with the input's data type. + // 1. 
CPU buffer to GPU Buffer Tensor *internal_tensor = ws->CreateTensor( InternalTransformedName(input->name()), context->device()->allocator(), input->dtype()); - output->Resize(input->shape()); - const uint8_t *input_ptr = input->data(); - Tensor::MappingGuard guard(internal_tensor); - uint8_t *internal_ptr = internal_tensor->mutable_data(); - memcpy(internal_ptr, input_ptr, input->raw_size()); - // convert the internal GPU Buffer to output. + VLOG(2) << "Transform CPU Buffer " << input->name() + << " to GPU Buffer " << internal_tensor->name() + << " with data type " << dt; + if (input->shape().size() == 4) { + // 1. (NCHW -> NHWC) + std::vector dst_dims = {0, 2, 3, 1}; + std::vector output_shape = + TransposeShape(input->shape(), + dst_dims); + internal_tensor->Resize(output_shape); + // TODO(liuqi): Only support float now + const float *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + float *internal_ptr = internal_tensor->mutable_data(); + MACE_RETURN_IF_ERROR(ops::Transpose(input_ptr, + input->shape(), + dst_dims, + internal_ptr)); + } else { + internal_tensor->Resize(input->shape()); + const uint8_t *input_ptr = input->data(); + Tensor::MappingGuard guard(internal_tensor); + uint8_t *internal_ptr = internal_tensor->mutable_data(); + memcpy(internal_ptr, input_ptr, input->raw_size()); + } + // 2. convert the internal GPU Buffer to output. return kernel_->Compute( context, internal_tensor, type, wino_blk_size, output); } - } else { // out_mem_type == MemoryType::CPU_BUFFER - // convert to the GPU Buffer with the output's data type. + } else if (out_mem_type == MemoryType::CPU_BUFFER) { + // 1. convert to the GPU Buffer with the output's data type. Tensor internal_tensor(context->device()->allocator(), dt, false, InternalTransformedName(input->name())); MACE_RETURN_IF_ERROR(kernel_->Compute( context, input, type, wino_blk_size, &internal_tensor)); - // convert the internal GPU Buffer to output. - Tensor::MappingGuard guard(&internal_tensor); - const T *internal_ptr = internal_tensor.data(); - output->Resize(internal_tensor.shape()); - T *output_ptr = output->mutable_data(); - memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + // 2. convert the internal GPU Buffer to output. + VLOG(2) << "Transform GPU Buffer " << internal_tensor.name() + << " to CPU Buffer " << output->name() + << " with data type " << dt; + if (internal_tensor.shape().size() == 4) { + // NHWC -> NCHW + std::vector dst_dims = {0, 3, 1, 2}; + std::vector output_shape = + TransposeShape(internal_tensor.shape(), + dst_dims); + Tensor::MappingGuard guard(&internal_tensor); + const float *internal_ptr = internal_tensor.data(); + output->Resize(output_shape); + float *output_ptr = output->mutable_data(); + return ops::Transpose(internal_ptr, + internal_tensor.shape(), + dst_dims, + output_ptr); + } else { + Tensor::MappingGuard guard(&internal_tensor); + const T *internal_ptr = internal_tensor.data(); + output->Resize(internal_tensor.shape()); + T *output_ptr = output->mutable_data(); + memcpy(output_ptr, internal_ptr, internal_tensor.size() * sizeof(T)); + return MaceStatus::MACE_SUCCESS; + } + } else { + LOG(FATAL) << "Unexpected error: " << out_mem_type; return MaceStatus::MACE_SUCCESS; } } private: std::string InternalTransformedName(const std::string &name) { - // TODO(liuqi): This may create a conflict. 
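// --- Illustrative sketch, not part of this patch: the effect of the
// {0, 2, 3, 1} permutation used by Transform() when copying a CPU buffer to a
// GPU buffer. The assumption here is that TransposeShape()/Transpose() place
// input dimension dst_dims[i] at output dimension i. ---
const std::vector<index_t> nchw_shape = {1, 32, 56, 48};  // N, C, H, W
const std::vector<int> to_nhwc = {0, 2, 3, 1};
std::vector<index_t> nhwc_shape(4);
for (size_t i = 0; i < 4; ++i) {
  nhwc_shape[i] = nchw_shape[to_nhwc[i]];                 // {1, 56, 48, 32}
}
// The inverse permutation {0, 3, 1, 2} used on the GPU-to-CPU path maps the
// NHWC shape back to NCHW.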
const char *postfix = "_mace_identity_internal"; return name + postfix; } @@ -95,29 +141,31 @@ class OpenCLBufferTransformer { std::unique_ptr kernel_; }; -std::string TransformedName(const std::string &name); +std::string TransformedFilterName(const std::string &name); template MaceStatus TransformFilter( mace::OpConstructContext *context, OperatorDef *op_def, const int input_idx, - const BufferType buffer_type, - const MemoryType mem_type) { + const OpenCLBufferType buffer_type, + const MemoryType mem_type, + const int wino_blk_size = 0) { const DataType dt = DataTypeToEnum::value; OpContext op_context(context->workspace(), context->device()); Workspace *ws = context->workspace(); std::string input_name = op_def->input(input_idx); Tensor *input = ws->GetTensor(input_name); - std::string output_name = TransformedName(input_name); + std::string output_name = TransformedFilterName(input_name); Tensor *output = - ws->CreateTensor(output_name, context->device()->allocator(), dt); + ws->CreateTensor(output_name, context->device()->allocator(), dt, true); // update the information op_def->set_input(input_idx, output_name); input->MarkUnused(); return OpenCLBufferTransformer(input->memory_type(), mem_type). - Transform(&op_context, input, buffer_type, 0, mem_type, output); + Transform(&op_context, input, buffer_type, mem_type, wino_blk_size, + output); } } // namespace ops diff --git a/mace/ops/opencl/conv_2d.h b/mace/ops/opencl/conv_2d.h index cf0911f79ee6cff726383f804a590bf42ed2b229..03f2cd49861fec380d2effc6fc88a2e8e6d580de 100644 --- a/mace/ops/opencl/conv_2d.h +++ b/mace/ops/opencl/conv_2d.h @@ -26,6 +26,14 @@ class OpContext; namespace ops { class OpenCLConv2dKernel { public: + virtual bool CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) = 0; + virtual MaceStatus Compute( OpContext *context, const Tensor *input, @@ -37,6 +45,7 @@ class OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, + const int winograd_blk_size, Tensor *output) = 0; MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConv2dKernel); }; diff --git a/mace/ops/opencl/depthwise_deconv2d.h b/mace/ops/opencl/depthwise_deconv2d.h index 994c98a21f012dd6dbc72f8c39ad0ef0e2e839d3..4238f0d279e746ab5a7efb564878181b68ddae8e 100644 --- a/mace/ops/opencl/depthwise_deconv2d.h +++ b/mace/ops/opencl/depthwise_deconv2d.h @@ -15,6 +15,7 @@ #ifndef MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ #define MACE_OPS_OPENCL_DEPTHWISE_DECONV2D_H_ +#include #include #include "mace/ops/activation.h" diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 7eb392a8ea05569ba266c660f89328325fdb3c8e..11487b14446c08b3a086c2e8a0284f8ec28ccf24 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/ops/opencl/helper.cc @@ -24,136 +24,9 @@ namespace mace { namespace ops { -namespace { -// [(C + 3) / 4 * W, N * H] -void CalInOutputImageShape(const std::vector &shape, /* NHWC */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2]; - (*image_shape)[1] = shape[0] * shape[1]; -} - -// [Ic, H * W * (Oc + 3) / 4] -void CalConv2dFilterImageShape(const std::vector &shape, /* OIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[1]; - (*image_shape)[1] = shape[2] * shape[3] * RoundUpDiv4(shape[0]); -} - -// [H * W * M, (Ic + 3) / 4] 
-void CalDepthwiseConv2dFilterImageShape( - const std::vector &shape, /* MIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[0] * shape[2] * shape[3]; - (*image_shape)[1] = RoundUpDiv4(shape[1]); -} - -// [(size + 3) / 4, 1] -void CalArgImageShape(const std::vector &shape, - std::vector *image_shape) { - MACE_CHECK(shape.size() == 1); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[0]); - (*image_shape)[1] = 1; -} - -// Only support 3x3 now -// [ (Ic + 3) / 4, 16 * Oc] -void CalWinogradFilterImageShape( - const std::vector &shape, /* Oc, Ic, H, W*/ - std::vector *image_shape, - const int blk_size) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[1]); - (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2)); -} - - -// [W * C, N * RoundUp<4>(H)] -void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[2] * shape[3]; - (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]); -} - -// [RoundUp<4>(W) * C, N * H] -void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3]; - (*image_shape)[1] = shape[0] * shape[1]; -} - -// [Ic * H * W, (Oc + 3) / 4] -void CalWeightHeightImageShape(const std::vector &shape, /* OIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = shape[1] * shape[2] * shape[3]; - (*image_shape)[1] = RoundUpDiv4(shape[0]); -} - -// [(Ic + 3) / 4 * H * W, Oc] -void CalWeightWidthImageShape(const std::vector &shape, /* OIHW */ - std::vector *image_shape) { - MACE_CHECK(shape.size() == 4); - image_shape->resize(2); - (*image_shape)[0] = RoundUpDiv4(shape[1]) * shape[2] * shape[3]; - (*image_shape)[1] = shape[0]; -} -} // namespace - -void CalImage2DShape(const std::vector &shape, /* NHWC */ - const BufferType type, - std::vector *image_shape, - const int wino_block_size) { - MACE_CHECK_NOTNULL(image_shape); - switch (type) { - case CONV2D_FILTER: - CalConv2dFilterImageShape(shape, image_shape); - break; - case DW_CONV2D_FILTER: - CalDepthwiseConv2dFilterImageShape(shape, image_shape); - break; - case IN_OUT_CHANNEL: - CalInOutputImageShape(shape, image_shape); - break; - case ARGUMENT: - CalArgImageShape(shape, image_shape); - break; - case IN_OUT_HEIGHT: - CalInOutHeightImageShape(shape, image_shape); - break; - case IN_OUT_WIDTH: - CalInOutWidthImageShape(shape, image_shape); - break; - case WINOGRAD_FILTER: - CalWinogradFilterImageShape(shape, image_shape, wino_block_size); - break; - case WEIGHT_HEIGHT: - CalWeightHeightImageShape(shape, image_shape); - break; - case WEIGHT_WIDTH: - CalWeightWidthImageShape(shape, image_shape); - break; - default: - LOG(FATAL) << "Mace not supported yet."; - } -} - std::vector FormatBufferShape( const std::vector &buffer_shape, - const BufferType type) { - + const OpenCLBufferType type) { const size_t buffer_shape_size = buffer_shape.size(); switch (type) { case IN_OUT_CHANNEL: diff --git a/mace/ops/opencl/helper.h b/mace/ops/opencl/helper.h index d4b5aa51a0f6a53da15e4862135bf6d0b6fd721e..e2f51a43d7dab565067d1f8bf450fc3a97f060c8 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/ops/opencl/helper.h @@ -24,8 +24,8 @@ #include "mace/core/macros.h" #include 
"mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/types.h" -#include "mace/ops/opencl/common.h" #include "mace/utils/utils.h" namespace mace { @@ -92,14 +92,9 @@ const float kMaxKernelExecTime = 1000.0; // microseconds // Base GPU cache size used for computing local work group size. const int32_t kBaseGPUMemCacheSize = 16384; -void CalImage2DShape(const std::vector &shape, /* NHWC */ - const BufferType type, - std::vector *image_shape, - const int wino_blk_size = 2); - std::vector FormatBufferShape( const std::vector &buffer_shape, - const BufferType type); + const OpenCLBufferType type); // CPU data type to OpenCL command data type std::string DtToCLCMDDt(const DataType dt); diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h index bde9c6b06ff8bc7bfcfc63fcd5cd324d7f23cb83..7692ac06b8e281295381b7ecf77d446784988859 100644 --- a/mace/ops/opencl/image/addn.h +++ b/mace/ops/opencl/image/addn.h @@ -101,8 +101,8 @@ MaceStatus AddNKernel::Compute( MACE_OUT_OF_RANGE_INIT(kernel_); if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR( output_tensor->ResizeImage(output_shape, output_image_shape)); diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h index b92c9a4eea17bfb1ea5df710de9485c1e5293b7c..9d91802627c840538b70d5a4f994d3ca572e8504 100644 --- a/mace/ops/opencl/image/batch_to_space.h +++ b/mace/ops/opencl/image/batch_to_space.h @@ -56,8 +56,8 @@ MaceStatus BatchToSpaceKernel::Compute( const std::vector &output_shape, Tensor *space_tensor) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR( space_tensor->ResizeImage(output_shape, output_image_shape)); diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index e84691f85d1149f5cc87cbc6659b80ae786f2c71..14a0ae4b3e474eb464580701446346248f5d1982 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -36,7 +36,7 @@ class BufferToImage : public OpenCLBufferTransformKernel { MaceStatus Compute( OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) override; @@ -49,20 +49,16 @@ template MaceStatus BufferToImage::Compute( OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; - CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); - if (type == WINOGRAD_FILTER) { - std::vector new_shape = - {(wino_blk_size + 2) * (wino_blk_size + 2), - input->dim(0), input->dim(1)}; - MACE_RETURN_IF_ERROR(output->ResizeImage(new_shape, image_shape)); - } else { - MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); - } + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); + MACE_RETURN_IF_ERROR(output->ResizeImage(input->shape(), image_shape)); 
uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; @@ -196,9 +192,6 @@ MaceStatus BufferToImage::Compute( }; } - // Mark the buffer unused. - const_cast(input)->MarkUnused(); - return MaceStatus::MACE_SUCCESS; } diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h index f12ad25cddecaa85989921445f8e0c258c83989a..c7f5e099168f43182cdb9e7bb39ac9df0dbdaeb6 100644 --- a/mace/ops/opencl/image/concat.h +++ b/mace/ops/opencl/image/concat.h @@ -92,7 +92,9 @@ MaceStatus ConcatKernel::Compute( inputs_count == 2 || divisible_four, "Dimensions of inputs should be divisible by 4 when inputs_count > 2."); std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); switch (inputs_count) { diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h index 224432e894fe7e25f873ac45a725ea2e8de13571..51c9d1dfe8a6f9ddbf2fccbae600576d536c5301 100644 --- a/mace/ops/opencl/image/conv_2d.h +++ b/mace/ops/opencl/image/conv_2d.h @@ -28,55 +28,76 @@ namespace ops { namespace opencl { namespace image { -extern MaceStatus Conv2dOpenclK1x1(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size); - -extern MaceStatus Conv2dOpenclK3x3(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size); - -extern MaceStatus Conv2dOpencl(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size); +extern MaceStatus Conv2dK1x1(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size); +extern MaceStatus Conv2dK3x3(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size); + +extern MaceStatus Conv2d(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size); + +extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, + 
cl::Kernel *kernels[3], + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *padding, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + const int wino_blk_size, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size[3]); template class Conv2dKernel : public OpenCLConv2dKernel { public: + bool CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_block_size) override; + MaceStatus Compute( OpContext *context, const Tensor *input, @@ -88,14 +109,54 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, + const int wino_blk_size, Tensor *output) override; private: - cl::Kernel kernel_; - uint32_t kwg_size_; + cl::Kernel kernels_[3]; + uint32_t kwg_size_[3]; std::vector input_shape_; }; +template +bool Conv2dKernel::CheckUseWinograd( + OpenCLRuntime *runtime, + const std::vector &filter_shape, + const std::vector &output_shape, + const int *strides, + const int *dilations, + int *wino_blk_size) { + if (filter_shape[2] != 3 || filter_shape[3] != 3 || + strides[0] > 1 || strides[1] > 1 || + dilations[0] > 1 || dilations[1] > 1) { + return false; + } + index_t out_channels = filter_shape[0]; + index_t in_channels = filter_shape[1]; + auto opencl_image_max_size = runtime->GetMaxImage2DSize(); + auto check_opencl_limit = [&](int block_size) -> bool { + int sqr_block = (block_size + 2) * (block_size + 2); + uint64_t transformed_width = static_cast(output_shape[0] * + ((output_shape[1] + block_size - 1) / block_size) * + ((output_shape[2] + block_size - 1) / block_size)); + return (transformed_width < opencl_image_max_size[0] && + static_cast(sqr_block * in_channels) + < opencl_image_max_size[1] && + static_cast(sqr_block * out_channels) + < opencl_image_max_size[1]); + }; + // GPU only supports 4x4 and 2x2 gpu winograd convolution + if (*wino_blk_size == 4) { + // if block size == 4 exceed OpenCL image size limitation, fallback to 2 + if (!check_opencl_limit(4)) { + *wino_blk_size = 2; + } else { + return true; + } + } + return check_opencl_limit(2); +} + template MaceStatus Conv2dKernel::Compute( OpContext *context, @@ -108,19 +169,8 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, + const int wino_blk_size, Tensor *output) { - typedef MaceStatus (*Conv2dOpenclFunction)( - OpContext *context, - cl::Kernel *kernel, const Tensor *input, const Tensor *filter, - const Tensor *bias, const int stride, const int *padding, - const int *dilations, const ActivationType activation, - const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, - uint32_t *kwg_size); - // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[3] = { - Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3}; - index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); if (strides[0] != strides[1] || @@ -148,24 +198,85 @@ MaceStatus Conv2dKernel::Compute( } std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - if (kernel_h == kernel_w && kernel_h <= 3 && - selector[kernel_h - 
1] != nullptr) { - auto conv2d_func = selector[kernel_h - 1]; - return conv2d_func(context, - &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, - activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, &kwg_size_); + std::function conv_func; + + if (wino_blk_size != 0) { + // use winograd covolution + conv_func = [&]() -> MaceStatus { + cl::Kernel *kernels[3] = {&kernels_[0], &kernels_[1], &kernels_[2]}; + uint32_t *kwg_size[3] = {&kwg_size_[0], &kwg_size_[1], &kwg_size_[2]}; + return WinogradConv2dK3x3S1(context, + kernels, + input, + filter, + bias, + paddings.data(), + activation, + relux_max_limit, + DataTypeToEnum::value, + wino_blk_size, + &input_shape_, + output, + kwg_size); + }; + } else if (kernel_h == 1 && kernel_w == 1) { + conv_func = [&]() -> MaceStatus { + return Conv2dK1x1(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + DataTypeToEnum::value, + &input_shape_, + output, + &kwg_size_[0]); + }; + } else if (kernel_h == 3 && kernel_w == 3) { + conv_func = [&]() -> MaceStatus { + return Conv2dK3x3(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + DataTypeToEnum::value, + &input_shape_, + output, + &kwg_size_[0]); + }; } else { - return Conv2dOpencl( - context, &kernel_, input, filter, bias, - strides[0], paddings.data(), dilations, - activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, &kwg_size_); + conv_func = [&]() -> MaceStatus { + return Conv2d(context, + &kernels_[0], + input, + filter, + bias, + strides[0], + paddings.data(), + dilations, + activation, + relux_max_limit, + DataTypeToEnum::value, + &input_shape_, + output, + &kwg_size_[0]); + }; } + + return conv_func(); } } // namespace image diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc index 74a7ddc9ace77bb5b2abfa2608cdb8aee35ea842..f88882ee645814f81d13bef5cd80ef9ebcb5092f 100644 --- a/mace/ops/opencl/image/conv_2d_1x1.cc +++ b/mace/ops/opencl/image/conv_2d_1x1.cc @@ -66,20 +66,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK1x1(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +extern MaceStatus Conv2dK1x1(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index 42a2a81ef3077197b0752b818cc6a34f48f6a233..3e5aee909c89bbed8e94488c5d38d8be3f93615d 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -59,20 +59,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK3x3(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const 
int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +extern MaceStatus Conv2dK3x3(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index 9b577c2b08f1b2cfd1bb90b266b8cb45b9e72f5a..120a3daa3067d91118c101e8b95798f7bde84a1d 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -67,20 +67,20 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpencl(OpContext *context, - cl::Kernel *kernel, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int stride, - const int *padding, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - const DataType dt, - std::vector *prev_input_shape, - Tensor *output, - uint32_t *kwg_size) { +extern MaceStatus Conv2d(OpContext *context, + cl::Kernel *kernel, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int stride, + const int *padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index 95a989a1b901b73712efc4db463b403f85817b66..c8f98a4ca7a2f2cdf8ba96135444e31e25ed1867 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -129,7 +129,9 @@ MaceStatus CropKernel::Compute( << input1->dim(i) << "and offset" << offsets[i]; } std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); const index_t offset_chan_blk = RoundUpDiv4(offsets[3]); diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index a0f51874dc2b1de7bbd3d7cd8be9dad8328be49e..f3d6cbe92049380634540ae94419b96a2a1444e1 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -64,8 +64,8 @@ MaceStatus Deconv2dKernel::Compute( const std::vector &output_shape, Tensor *output) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); const DataType dt = DataTypeToEnum::value; const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index 2ed253df28dc47c63bbb47f1009f8b2a70f80f74..77c4bd53dfc661fd23381d9e8ebac3cf33c15017 100644 --- 
a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -77,7 +77,9 @@ MaceStatus DepthToSpaceKernel::Compute( output_width, output_depth}; std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); const uint32_t gws[3] = { diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h index e818b039caac7e8a207fde5aef88c01bbe4f9bd7..c4ee3cb79ea54424938206cb16b2ec63a54c8cc9 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.h +++ b/mace/ops/opencl/image/depthwise_conv2d.h @@ -112,8 +112,8 @@ MaceStatus DepthwiseConv2dKernel::Compute( } std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); return depthwise::DepthwiseConv2d( diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h index 040c349d4382864a100060b539e4a323529963ee..96fdfa51e110395f3028003f3058a029765519f5 100644 --- a/mace/ops/opencl/image/depthwise_deconv2d.h +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -76,8 +76,8 @@ MaceStatus DepthwiseDeconv2dKernel::Compute( "opencl image deconv only supports depthwise type group."); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); const DataType dt = DataTypeToEnum::value; diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index 25235a442ffb0a5cab8ca90ddc29a8fb9caead88..2afb334233731307582d83ea77d2ec1ad77ce661 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -101,8 +101,8 @@ MaceStatus EltwiseKernel::Compute( output_shape[3] = input0->dim(3); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h index 2d8fbb88ccf4abbfe46c9a23056af611ec59bc6a..962ffaf082ca93e1f6129fa2f5d123c0e3454603 100644 --- a/mace/ops/opencl/image/fully_connected.h +++ b/mace/ops/opencl/image/fully_connected.h @@ -60,8 +60,8 @@ MaceStatus FullyConnectedKernel::Compute( Tensor *output) { std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index 4200087eeeb6052e2e36a1f63e9ce373dd773cd6..6ca73fa6af9b8a39c43d6586d9167ca8655d6ffa 100644 --- 
a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -33,7 +33,7 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { public: MaceStatus Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) override; @@ -45,12 +45,15 @@ class ImageToBuffer : public OpenCLBufferTransformKernel { template MaceStatus ImageToBuffer::Compute(OpContext *context, const Tensor *input, - const BufferType type, + const OpenCLBufferType type, const int wino_blk_size, Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; - CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); + OpenCLUtil::CalImage2DShape(formatted_buffer_shape, + type, + &image_shape, + wino_blk_size); MACE_RETURN_IF_ERROR(output->Resize(input->shape())); uint32_t gws[2] = {static_cast(image_shape[0]), diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h index 967c4bf4c101e31f0f88413e216e82bf87d804ae..546b4a792de1c892a3fd9d6c0e11f255b9cb7501 100644 --- a/mace/ops/opencl/image/lstm_cell.h +++ b/mace/ops/opencl/image/lstm_cell.h @@ -98,8 +98,9 @@ MaceStatus LSTMCellKernel::Compute( if (!IsVecEqual(input_shape_, input->shape())) { std::vector output_shape_padded = {height, 1, 1, hidden_units}; std::vector output_image_shape; - CalImage2DShape(output_shape_padded, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape_padded, + OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(pre_output->shape(), output_image_shape)); MACE_RETURN_IF_ERROR(cell->ResizeImage(pre_cell->shape(), diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h index 899df5a5a8899cade7608e1895208113c4c26d00..763082f610f5b4a115a76fc55be08c459a278d14 100644 --- a/mace/ops/opencl/image/matmul.h +++ b/mace/ops/opencl/image/matmul.h @@ -70,7 +70,9 @@ MaceStatus MatMulKernel::Compute( c_shape[rank - 1] = width; std::vector c_image_shape; std::vector padded_c_shape = {batch, height, width, 1}; - CalImage2DShape(padded_c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape); + OpenCLUtil::CalImage2DShape(padded_c_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &c_image_shape); MACE_RETURN_IF_ERROR(C->ResizeImage(c_shape, c_image_shape)); const index_t height_blocks = RoundUpDiv4(height); diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h index c96d964a5623f68f1df7d441ec61ff675d218296..cb0c390b667a46329ab4f9728caeea10f1eea0c7 100644 --- a/mace/ops/opencl/image/pad.h +++ b/mace/ops/opencl/image/pad.h @@ -68,7 +68,9 @@ MaceStatus PadKernel::Compute( input_shape[3] + this->paddings_[6] + this->paddings_[7]}; std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); const index_t batch = output->dim(0); diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h index 1384b54b812e85bb20d75aa9f25a9dbcb257f44d..f246efa426618e9c197f30d253e23338bd11f73d 100644 --- a/mace/ops/opencl/image/pooling.h +++ b/mace/ops/opencl/image/pooling.h @@ -108,8 +108,8 @@ MaceStatus PoolingKernel::Compute( } std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + 
OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/reduce_mean.h b/mace/ops/opencl/image/reduce_mean.h index 953742cbbec2e24f257f28d4684a80729cadf9ac..95b51d86f883338fd0e4e57952edfd5965f85a61 100644 --- a/mace/ops/opencl/image/reduce_mean.h +++ b/mace/ops/opencl/image/reduce_mean.h @@ -72,8 +72,8 @@ MaceStatus ReduceMeanKernel::Compute( std::vector lws(3); std::vector output_shape{batch, 1, 1, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h index e801e59f35c3514b7d02ff1173899eb9a2466a4e..bf5bfcf1921254c3939f77a5f3dc7711ea780289 100644 --- a/mace/ops/opencl/image/resize_bicubic.h +++ b/mace/ops/opencl/image/resize_bicubic.h @@ -133,8 +133,8 @@ MaceStatus ResizeBicubicKernel::Compute( std::vector output_shape{batch, out_height, out_width, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h index 7af9a5f60a0b69775923beebc71bc2e7276983cc..b3f1b09c6ee08f356f328e9e729c573abd5bb4e4 100644 --- a/mace/ops/opencl/image/resize_bilinear.h +++ b/mace/ops/opencl/image/resize_bilinear.h @@ -134,8 +134,8 @@ MaceStatus ResizeBilinearKernel::Compute( std::vector output_shape{batch, out_height, out_width, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h index 9924f02fdc82fe30527a7c958341f9cd3f2c5540..f2baaba48259da64f2f8ed18620da37edd154245 100644 --- a/mace/ops/opencl/image/space_to_batch.h +++ b/mace/ops/opencl/image/space_to_batch.h @@ -56,8 +56,8 @@ MaceStatus SpaceToBatchKernel::Compute( const std::vector &output_shape, Tensor *batch_tensor) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR( batch_tensor->ResizeImage(output_shape, output_image_shape)); const char *kernel_name = "space_to_batch"; diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h index 961d16066097d9f5448c2d3c61752cb97695e316..e225b37693377acf57f2d91b17cc3269bc8a20a3 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -74,7 +74,9 @@ MaceStatus SpaceToDepthKernel::Compute( output_depth}; std::vector image_shape; - 
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 12286a6d737eff94f96ec4d3194b8d2bc5a36d6f..7b7f790597f4daba916a0ab2cc1d103fdf11df26 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -34,7 +34,9 @@ namespace image { template class SplitKernel : public OpenCLSplitKernel { public: - explicit SplitKernel(const int32_t axis) : axis_(axis) {} + explicit SplitKernel(const int32_t axis) : axis_(axis) { + MACE_CHECK(axis == 3) << "GPU only support channel-dimension split"; + } MaceStatus Compute( OpContext *context, const Tensor *input, @@ -60,7 +62,9 @@ MaceStatus SplitKernel::Compute( {input->dim(0), input->dim(1), input->dim(2), output_channels}); std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(output_shape, + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); for (size_t i = 0; i < outputs_count; ++i) { MACE_RETURN_IF_ERROR( output_list[i]->ResizeImage(output_shape, image_shape)); diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index d356b89859ee9a9c24541a1270f919f188be62eb..d0c217fe450018d038e2d617fe4bdf5e6c4ba5de 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -68,8 +68,8 @@ MaceStatus SqrDiffMeanKernel::Compute( std::vector lws(3); std::vector output_shape{batch, 1, 1, channels}; std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, - &output_image_shape); + OpenCLUtil::CalImage2DShape(output_shape, OpenCLBufferType::IN_OUT_CHANNEL, + &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); auto runtime = context->device()->opencl_runtime(); diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9bd717155b7cfed5f5a6cac32a64d57fad63545 --- /dev/null +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -0,0 +1,350 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/op_context.h" +#include "mace/ops/activation.h" +#include "mace/ops/conv_pool_2d_util.h" +#include "mace/ops/opencl/helper.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace ops { +namespace opencl { +namespace image { + +namespace { +MaceStatus WinogradInputTransform(OpContext *context, + cl::Kernel *kernel, + const Tensor *input_tensor, + const DataType dt, + const int *paddings, + const index_t round_h, + const index_t round_w, + const int wino_blk_size, + const bool input_changed, + Tensor *output_tensor, + uint32_t *kwg_size, + StatsFuture *future) { + OpenCLRuntime *runtime = context->device()->opencl_runtime(); + const index_t out_width = output_tensor->dim(2); + + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernel->get() == nullptr) { + std::string obfuscated_kernel_name; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + if (wino_blk_size == 4) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); + built_options.emplace("-Dwinograd_transform_4x4=" + + obfuscated_kernel_name); + } else if (wino_blk_size == 2) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); + built_options.emplace("-Dwinograd_transform_2x2=" + + obfuscated_kernel_name); + } else { + MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); + return MaceStatus::MACE_SUCCESS; + } + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + kernel)); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const uint32_t gws[2] = { + static_cast(out_width), + static_cast(RoundUpDiv4(input_tensor->dim(3))) + }; + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_2D_GWS_ARGS(*kernel, gws); + kernel->setArg(idx++, *(input_tensor->opencl_image())); + kernel->setArg(idx++, *(output_tensor->opencl_image())); + kernel->setArg(idx++, static_cast(input_tensor->dim(1))); + kernel->setArg(idx++, static_cast(input_tensor->dim(2))); + kernel->setArg(idx++, static_cast(input_tensor->dim(3))); + kernel->setArg(idx++, static_cast(round_h * round_w)); + kernel->setArg(idx++, static_cast(round_w)); + kernel->setArg(idx++, static_cast(paddings[0] / 2)); + kernel->setArg(idx++, static_cast(paddings[1] / 2)); + } + + + const std::vector lws = {*kwg_size / 8, 8, 0}; + std::string tuning_key = Concat("winograd_transform_kernel", + output_tensor->dim(0), + output_tensor->dim(1), + output_tensor->dim(2)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus WinogradOutputTransform(OpContext *context, + cl::Kernel *kernel, + const Tensor *input_tensor, + const Tensor *bias, + const DataType dt, + const index_t round_h, + const index_t round_w, + const int wino_blk_size, + const ActivationType activation, + const float relux_max_limit, + const bool input_changed, + Tensor *output_tensor, + uint32_t *kwg_size, + StatsFuture *future) { + OpenCLRuntime *runtime = context->device()->opencl_runtime(); + auto &output_shape = output_tensor->shape(); + + MACE_OUT_OF_RANGE_DEFINITION; + if (kernel->get() == nullptr) { + std::string 
obfuscated_kernel_name; + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + if (wino_blk_size == 4) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); + built_options.emplace("-Dwinograd_inverse_transform_4x4=" + + obfuscated_kernel_name); + } else if (wino_blk_size == 2) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); + built_options.emplace("-Dwinograd_inverse_transform_2x2=" + + obfuscated_kernel_name); + } else { + MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); + return MaceStatus::MACE_SUCCESS; + } + + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + default: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options, + kernel)); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); + } + + const uint32_t gws[2] = { + static_cast(input_tensor->dim(2)), + static_cast(RoundUpDiv4(input_tensor->dim(1)))}; + MACE_OUT_OF_RANGE_INIT(*kernel); + if (input_changed) { + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernel); + MACE_SET_2D_GWS_ARGS(*kernel, gws); + kernel->setArg( + idx++, + *(static_cast(input_tensor->opencl_image()))); + if (bias != nullptr) { + kernel->setArg(idx++, + *(static_cast(bias->opencl_image()))); + } + kernel->setArg( + idx++, *(static_cast(output_tensor->opencl_image()))); + kernel->setArg(idx++, static_cast(output_shape[1])); + kernel->setArg(idx++, static_cast(output_shape[2])); + kernel->setArg(idx++, static_cast(round_h * round_w)); + kernel->setArg(idx++, static_cast(round_w)); + kernel->setArg(idx++, relux_max_limit); + } + const std::vector lws = {*kwg_size / 8, 8, 0}; + std::string tuning_key = + Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), + output_tensor->dim(1), output_tensor->dim(2), + output_tensor->dim(3), input_tensor->dim(2)); + MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, + gws, lws, future)); + + MACE_OUT_OF_RANGE_VALIDATION; + return MaceStatus::MACE_SUCCESS; +} +} // namespace + + +extern MaceStatus WinogradConv2dK3x3S1(OpContext *context, + cl::Kernel *kernels[3], + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *paddings, + const ActivationType activation, + const float relux_max_limit, + const DataType dt, + const int wino_blk_size, + std::vector *prev_input_shape, + Tensor *output, + uint32_t *kwg_size[3]) { + OpenCLRuntime *runtime = context->device()->opencl_runtime(); + ScratchImageManager *scratch_manager = runtime->scratch_image_manager(); + StatsFuture t_input_future, mm_future, t_output_future; + bool input_changed = !IsVecEqual(*prev_input_shape, input->shape()); + *prev_input_shape = input->shape(); + + auto output_shape = output->shape(); + const index_t round_h = + (output_shape[1] + wino_blk_size - 1) / wino_blk_size; + const index_t round_w = + (output_shape[2] + wino_blk_size - 1) / 
wino_blk_size; + const index_t out_width = input->dim(0) * round_h * round_w; + + const index_t blk_sqr = (wino_blk_size + 2) * (wino_blk_size + 2); + + index_t in_channel = input->dim(3); + index_t out_channel = output->dim(3); + + // 0. transform input + // input(NHWC) -> t_input(blk_sqr, in_channel, out_width) + std::vector t_input_shape = + {blk_sqr, in_channel, out_width}; + std::vector padded_t_input_shape = { + t_input_shape[0], t_input_shape[1], t_input_shape[2], 1 + }; + std::vector t_input_image_shape; + OpenCLUtil::CalImage2DShape(padded_t_input_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &t_input_image_shape); + ScratchImage transformed_input_image(scratch_manager); + std::unique_ptr transformed_input(new Tensor( + transformed_input_image.Scratch(context->device()->allocator(), + t_input_image_shape, dt), dt)); + MACE_RETURN_IF_ERROR(transformed_input->ResizeImage(t_input_shape, + t_input_image_shape)); + MACE_RETURN_IF_ERROR(WinogradInputTransform( + context, kernels[0], input, dt, paddings, + round_h, round_w, wino_blk_size, + input_changed, transformed_input.get(), + kwg_size[0], &t_input_future)); + + // 1. mat mul + // t_filter(blk_sqr, out_chan, in_chan)*t_input(blk_sqr, in_chan, out_width) + // -> t_output (blk_sqr, out_chan, out_width) + std::vector mm_output_shape = + {blk_sqr, out_channel, out_width}; + + std::vector padded_mm_output_shape = + {mm_output_shape[0], mm_output_shape[1], mm_output_shape[2], 1}; + std::vector mm_output_image_shape; + OpenCLUtil::CalImage2DShape(padded_mm_output_shape, + OpenCLBufferType::IN_OUT_HEIGHT, + &mm_output_image_shape); + + ScratchImage mm_output_image(scratch_manager); + std::unique_ptr mm_output(new Tensor( + mm_output_image.Scratch(context->device()->allocator(), + mm_output_image_shape, dt), dt)); + MACE_RETURN_IF_ERROR(mm_output->ResizeImage(mm_output_shape, + mm_output_image_shape)); + + const index_t height_blocks = RoundUpDiv4(mm_output_shape[1]); + const index_t width_blocks = RoundUpDiv4(mm_output_shape[2]); + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * blk_sqr), + }; + + MACE_OUT_OF_RANGE_DEFINITION; + + if (kernels[1]->get() == nullptr) { + std::set built_options; + MACE_OUT_OF_RANGE_CONFIG; + MACE_NON_UNIFORM_WG_CONFIG; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("matmul", kernel_name, + built_options, kernels[1])); + + *kwg_size[1] = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernels[1])); + } + MACE_OUT_OF_RANGE_INIT(*kernels[1]); + uint32_t idx = 0; + MACE_OUT_OF_RANGE_SET_ARGS(*kernels[1]); + MACE_SET_2D_GWS_ARGS(*kernels[1], gws); + kernels[1]->setArg(idx++, *(filter->opencl_image())); + kernels[1]->setArg(idx++, *(transformed_input->opencl_image())); + kernels[1]->setArg(idx++, *(mm_output->opencl_image())); + kernels[1]->setArg(idx++, static_cast(mm_output_shape[1])); + kernels[1]->setArg(idx++, static_cast(mm_output_shape[2])); + kernels[1]->setArg(idx++, static_cast(in_channel)); + kernels[1]->setArg(idx++, static_cast(height_blocks)); + kernels[1]->setArg(idx++, static_cast(RoundUpDiv4(in_channel))); + + const std::vector lws = {*kwg_size[1] / 64, 64, 0}; + std::string tuning_key = Concat("matmul_opencl_kernel", mm_output_shape[0], + mm_output_shape[1], mm_output_shape[2]); + 
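The new WinogradConv2dK3x3S1 above tiles the convolution output into wino_blk_size x wino_blk_size blocks, transforms the input into a {(m+2)^2, in_channel, N*round_h*round_w} tensor, and batch-multiplies it against the transformed filter to produce {(m+2)^2, out_channel, N*round_h*round_w}. The standalone sketch below reproduces just that shape arithmetic for F(m x m, 3x3); the names are mine, not MACE API:

#include <cstdint>
#include <iostream>

// Standalone sketch of the Winograd F(m x m, 3x3) tiling arithmetic used by
// WinogradConv2dK3x3S1 above: how many tiles cover the output, and the shapes
// of the transformed-input and batched-matmul intermediates.
struct WinogradShapes {
  int64_t round_h, round_w;  // tiles along H and W
  int64_t blk_sqr;           // (m + 2)^2 transformed positions per tile
  int64_t t_input[3];        // {blk_sqr, in_channel, N * round_h * round_w}
  int64_t mm_output[3];      // {blk_sqr, out_channel, N * round_h * round_w}
};

WinogradShapes ComputeWinogradShapes(int64_t batch,
                                     int64_t out_h, int64_t out_w,
                                     int64_t in_channel, int64_t out_channel,
                                     int blk /* 2 or 4 */) {
  WinogradShapes s;
  s.round_h = (out_h + blk - 1) / blk;
  s.round_w = (out_w + blk - 1) / blk;
  s.blk_sqr = (blk + 2) * (blk + 2);
  const int64_t out_width = batch * s.round_h * s.round_w;
  s.t_input[0] = s.blk_sqr;   s.t_input[1] = in_channel;   s.t_input[2] = out_width;
  s.mm_output[0] = s.blk_sqr; s.mm_output[1] = out_channel; s.mm_output[2] = out_width;
  return s;
}

int main() {
  // 1x56x56 output, 32 -> 64 channels, 4x4 blocks: 14 x 14 tiles, 36 positions.
  WinogradShapes s = ComputeWinogradShapes(1, 56, 56, 32, 64, 4);
  std::cout << s.round_h << " x " << s.round_w << " tiles, blk_sqr = "
            << s.blk_sqr << ", t_input = {" << s.t_input[0] << ", "
            << s.t_input[1] << ", " << s.t_input[2] << "}\n";
  return 0;
}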
MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernels[1], tuning_key, + gws, lws, &mm_future)); + + MACE_OUT_OF_RANGE_VALIDATION; + + // 2. transform output + // t_output (blk_sqr, out_chan, out_width) -> output(NHWC) + MACE_RETURN_IF_ERROR(WinogradOutputTransform( + context, kernels[2], mm_output.get(), bias, + dt, round_h, round_w, wino_blk_size, activation, relux_max_limit, + input_changed, output, kwg_size[2], &t_output_future)) + + MergeMultipleFutureWaitFn({t_input_future, mm_future, t_output_future}, + context->future()); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace image +} // namespace opencl +} // namespace ops +} // namespace mace diff --git a/mace/ops/opencl/image/winograd_transform.h b/mace/ops/opencl/image/winograd_transform.h deleted file mode 100644 index a555322dfc6327fbfd3d1f6e448af8b649724901..0000000000000000000000000000000000000000 --- a/mace/ops/opencl/image/winograd_transform.h +++ /dev/null @@ -1,316 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#ifndef MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ -#define MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ - -#include "mace/ops/opencl/winograd_transform.h" - -#include -#include -#include -#include - -#include "mace/core/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/activation.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/opencl/helper.h" - -namespace mace { -namespace ops { -namespace opencl { -namespace image { - -template -class WinogradTransformKernel : public OpenCLWinogradTransformKernel { - public: - WinogradTransformKernel( - Padding padding_type, - const std::vector &paddings, - const int block_size) - : strides_({1, 1}), - dilations_({1, 1}), - padding_type_(padding_type), - paddings_(paddings), - wino_blk_size_(block_size) {} - MaceStatus Compute( - OpContext *context, - const Tensor *input_tensor, - Tensor *output_tensor) override; - - private: - const std::vector strides_; // [stride_h, stride_w] - const std::vector dilations_; // [dilation_h, dilation_w] - Padding padding_type_; - std::vector paddings_; - const int wino_blk_size_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::vector input_shape_; -}; - -template -MaceStatus WinogradTransformKernel::Compute( - OpContext *context, - const Tensor *input_tensor, - Tensor *output_tensor) { - auto runtime = context->device()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - if (wino_blk_size_ == 4) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); - built_options.emplace("-Dwinograd_transform_4x4=" - + obfuscated_kernel_name); - } else if (wino_blk_size_ == 2) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); - built_options.emplace("-Dwinograd_transform_2x2=" - + obfuscated_kernel_name); - } else { - MACE_CHECK(false, 
"mace only supports 4x4 and 2x2 gpu winograd."); - return MaceStatus::MACE_SUCCESS; - } - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - std::vector output_shape(4); - std::vector filter_shape = {1, input_tensor->dim(3), 3, 3}; - std::vector paddings(2); - if (paddings_.empty()) { - ops::CalcNHWCPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), dilations_.data(), - strides_.data(), padding_type_, output_shape.data(), paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), - paddings_.data(), dilations_.data(), strides_.data(), - RoundType::FLOOR, output_shape.data()); - } - const index_t round_h = - (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_; - const index_t round_w = - (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_; - const index_t out_width = input_tensor->dim(0) * round_h * round_w; - - const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2); - - const uint32_t gws[2] = { - static_cast(out_width), - static_cast(RoundUpDiv4(input_tensor->dim(3))) - }; - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input_tensor->shape())) { - output_shape = {blk_sqr, input_tensor->dim(3), out_width}; - std::vector padded_output_shape = { - output_shape[0], output_shape[1], output_shape[2], 1 - }; - std::vector image_shape; - CalImage2DShape(padded_output_shape, - BufferType::IN_OUT_HEIGHT, - &image_shape); - // remove unused last dimension - MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg(idx++, *(input_tensor->opencl_image())); - kernel_.setArg(idx++, *(output_tensor->opencl_image())); - kernel_.setArg(idx++, static_cast(input_tensor->dim(1))); - kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); - kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); - kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, static_cast(paddings[0] / 2)); - kernel_.setArg(idx++, static_cast(paddings[1] / 2)); - - input_shape_ = input_tensor->shape(); - } - - - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = Concat("winograd_transform_kernel", - output_tensor->dim(0), - output_tensor->dim(1), - output_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} - -template -class WinogradInverseTransformKernel - : public OpenCLWinogradInverseTransformKernel { - public: - WinogradInverseTransformKernel( - ActivationType activation, - const float relux_max_limit, - const int block_size) - : wino_blk_size_(block_size), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - MaceStatus Compute( - OpContext *context, - const std::vector &inputs, - Tensor *output_tensor) override; - - private: - const int wino_blk_size_; - const ActivationType activation_; - const float relux_max_limit_; - cl::Kernel kernel_; - uint32_t kwg_size_; - std::vector input_shape_; -}; 
- -template -MaceStatus WinogradInverseTransformKernel::Compute( - OpContext *context, - const std::vector &inputs, - Tensor *output_tensor) { - auto runtime = context->device()->opencl_runtime(); - MACE_OUT_OF_RANGE_DEFINITION; - - const Tensor *input_tensor = inputs[0]; - const Tensor *bias = inputs.size() == 3 ? inputs[2] : nullptr; - - if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name; - std::set built_options; - MACE_OUT_OF_RANGE_CONFIG; - MACE_NON_UNIFORM_WG_CONFIG; - if (wino_blk_size_ == 4) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); - built_options.emplace("-Dwinograd_inverse_transform_4x4=" - + obfuscated_kernel_name); - } else if (wino_blk_size_ == 2) { - obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); - built_options.emplace("-Dwinograd_inverse_transform_2x2=" - + obfuscated_kernel_name); - } else { - MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MaceStatus::MACE_SUCCESS; - } - - built_options.emplace("-DDATA_TYPE=" + - DtToUpCompatibleCLDt(DataTypeToEnum::value)); - built_options.emplace("-DCMD_DATA_TYPE=" + - DtToUpCompatibleCLCMDDt(DataTypeToEnum::value)); - built_options.emplace(bias != nullptr ? "-DBIAS" : ""); - switch (activation_) { - case NOOP: - break; - case RELU: - built_options.emplace("-DUSE_RELU"); - break; - case RELUX: - built_options.emplace("-DUSE_RELUX"); - break; - case PRELU: - built_options.emplace("-DUSE_PRELU"); - break; - case TANH: - built_options.emplace("-DUSE_TANH"); - break; - case SIGMOID: - built_options.emplace("-DUSE_SIGMOID"); - break; - default: - LOG(FATAL) << "Unknown activation type: " << activation_; - } - - MACE_RETURN_IF_ERROR(runtime->BuildKernel("winograd_transform", - obfuscated_kernel_name, - built_options, - &kernel_)); - - kwg_size_ = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - } - - Tensor::MappingGuard output_shape_guard(inputs[1]); - const int32_t *output_shape_data = inputs[1]->data(); - const index_t batch = output_shape_data[0]; - const index_t height = output_shape_data[1]; - const index_t width = output_shape_data[2]; - const uint32_t gws[2] = { - static_cast(input_tensor->dim(2)), - static_cast(RoundUpDiv4(input_tensor->dim(1)))}; - MACE_OUT_OF_RANGE_INIT(kernel_); - if (!IsVecEqual(input_shape_, input_tensor->shape())) { - std::vector output_shape = {batch, height, width, - input_tensor->dim(1)}; - std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); - MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - - const index_t round_h = (height + wino_blk_size_ - 1) / wino_blk_size_; - const index_t round_w = (width + wino_blk_size_ - 1) / wino_blk_size_; - - uint32_t idx = 0; - MACE_OUT_OF_RANGE_SET_ARGS(kernel_); - MACE_SET_2D_GWS_ARGS(kernel_, gws); - kernel_.setArg( - idx++, - *(static_cast(input_tensor->opencl_image()))); - if (bias != nullptr) { - kernel_.setArg(idx++, - *(static_cast(bias->opencl_image()))); - } - kernel_.setArg( - idx++, *(static_cast(output_tensor->opencl_image()))); - kernel_.setArg(idx++, static_cast(output_shape[1])); - kernel_.setArg(idx++, static_cast(output_shape[2])); - kernel_.setArg(idx++, static_cast(round_h * round_w)); - kernel_.setArg(idx++, static_cast(round_w)); - kernel_.setArg(idx++, relux_max_limit_); - - input_shape_ = input_tensor->shape(); - } - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = - Concat("winograd_inverse_transform_kernel", 
output_tensor->dim(0), - output_tensor->dim(1), output_tensor->dim(2), - output_tensor->dim(3), input_tensor->dim(2)); - MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, context->future())); - - MACE_OUT_OF_RANGE_VALIDATION; - return MaceStatus::MACE_SUCCESS; -} -} // namespace image -} // namespace opencl -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/opencl/out_of_range_check_test.cc b/mace/ops/opencl/out_of_range_check_test.cc index f63d1db056e5ace6eb10cf37ba8cefa08d4f1bac..eb2236931b08561715ef08e3e3194084261004d8 100644 --- a/mace/ops/opencl/out_of_range_check_test.cc +++ b/mace/ops/opencl/out_of_range_check_test.cc @@ -144,7 +144,9 @@ TEST(OutOfRangeCheckTest, RandomTest) { std::vector image_shape; Tensor *image = ws.CreateTensor("Image", device->allocator(), DataTypeToEnum::v()); - CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); + OpenCLUtil::CalImage2DShape(buffer->shape(), + OpenCLBufferType::IN_OUT_CHANNEL, + &image_shape); image->ResizeImage(buffer->shape(), image_shape); ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape) != MaceStatus::MACE_SUCCESS); diff --git a/mace/ops/opencl/winograd_transform.h b/mace/ops/opencl/winograd_transform.h deleted file mode 100644 index f150481a7cacd173fcec7bb0a705206acebc6c45..0000000000000000000000000000000000000000 --- a/mace/ops/opencl/winograd_transform.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ -#define MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ - -#include - -#include "mace/public/mace.h" -#include "mace/utils/utils.h" -namespace mace { - -class OpContext; -class Tensor; - -namespace ops { - -class OpenCLWinogradTransformKernel { - public: - virtual MaceStatus Compute( - OpContext *context, - const Tensor *input, - Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel); -}; - -class OpenCLWinogradInverseTransformKernel { - public: - virtual MaceStatus Compute( - OpContext *context, - const std::vector &inputs, - Tensor *output) = 0; - MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_OPENCL_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/ops_registry.cc b/mace/ops/ops_registry.cc index 48a893760a8789fb8f831774726332d55e17a922..7407683d6464ea2559eca1d55ee548bd4e3c75dc 100644 --- a/mace/ops/ops_registry.cc +++ b/mace/ops/ops_registry.cc @@ -69,10 +69,7 @@ extern void RegisterQuantize(OpRegistryBase *op_registry); #ifdef MACE_ENABLE_OPENCL extern void RegisterBufferTransform(OpRegistryBase *op_registry); -extern void RegisterBufferInverseTransform(OpRegistryBase *op_registry); extern void RegisterLSTMCell(OpRegistryBase *op_registry); -extern void RegisterWinogradInverseTransform(OpRegistryBase *op_registry); -extern void RegisterWinogradTransform(OpRegistryBase *op_registry); #endif // MACE_ENABLE_OPENCL } // namespace ops @@ -130,10 +127,7 @@ OpRegistry::OpRegistry() : OpRegistryBase() { #ifdef MACE_ENABLE_OPENCL ops::RegisterBufferTransform(this); - ops::RegisterBufferInverseTransform(this); ops::RegisterLSTMCell(this); - ops::RegisterWinogradInverseTransform(this); - ops::RegisterWinogradTransform(this); #endif // MACE_ENABLE_OPENCL } diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index a41e5b0947b59718a9ef275b9971eb71726e2f17..21407c6a743491820d431e077d01e30aa629ac9b 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -13,11 +13,93 @@ // limitations under the License. 
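With the standalone Winograd transform ops removed, ops_registry.cc above now registers only BufferTransform and LSTMCell under MACE_ENABLE_OPENCL. A toy sketch of that registration pattern (factory functions keyed by op type, backend ops guarded by a compile-time flag); the types and the ENABLE_GPU macro here are stand-ins, not the MACE OpRegistryBase API:

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Toy registry: each op type maps to a factory, and GPU-only ops are only
// registered when the backend is compiled in (mirroring #ifdef MACE_ENABLE_OPENCL).
struct Op { virtual ~Op() = default; virtual void Run() = 0; };

class MiniRegistry {
 public:
  using Factory = std::function<std::unique_ptr<Op>()>;
  void Register(const std::string &type, Factory f) { factories_[type] = std::move(f); }
  std::unique_ptr<Op> Create(const std::string &type) const {
    auto it = factories_.find(type);
    return it == factories_.end() ? nullptr : it->second();
  }
 private:
  std::map<std::string, Factory> factories_;
};

struct Softmax : Op { void Run() override { std::cout << "softmax\n"; } };
#ifdef ENABLE_GPU
struct BufferTransform : Op { void Run() override { std::cout << "buffer transform\n"; } };
#endif

int main() {
  MiniRegistry registry;
  registry.Register("Softmax", [] { return std::unique_ptr<Op>(new Softmax); });
#ifdef ENABLE_GPU  // stand-in for MACE_ENABLE_OPENCL
  registry.Register("BufferTransform",
                    [] { return std::unique_ptr<Op>(new BufferTransform); });
#endif
  if (auto op = registry.Create("Softmax")) op->Run();
  return 0;
}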
#include "mace/ops/ops_test_util.h" +#include "mace/core/memory_optimizer.h" namespace mace { namespace ops { namespace test { + +OpDefBuilder::OpDefBuilder(const char *type, const std::string &name) { + op_def_.set_type(type); + op_def_.set_name(name); +} + +OpDefBuilder &OpDefBuilder::Input(const std::string &input_name) { + op_def_.add_input(input_name); + return *this; +} + +OpDefBuilder &OpDefBuilder::Output(const std::string &output_name) { + op_def_.add_output(output_name); + return *this; +} + +OpDefBuilder &OpDefBuilder::OutputType( + const std::vector &output_type) { + for (auto out_t : output_type) { + op_def_.add_output_type(out_t); + } + return *this; +} + +OpDefBuilder &OpDefBuilder::OutputShape( + const std::vector &output_shape) { + auto shape = op_def_.add_output_shape(); + for (auto s : output_shape) { + shape->add_dims(s); + } + return *this; +} + +OpDefBuilder OpDefBuilder::AddIntArg(const std::string &name, const int value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_i(value); + return *this; +} + +OpDefBuilder OpDefBuilder::AddFloatArg(const std::string &name, + const float value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_f(value); + return *this; +} + +OpDefBuilder OpDefBuilder::AddStringArg(const std::string &name, + const char *value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_s(value); + return *this; +} + +OpDefBuilder OpDefBuilder::AddIntsArg(const std::string &name, + const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_ints(value); + } + return *this; +} + +OpDefBuilder OpDefBuilder::AddFloatsArg(const std::string &name, + const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_floats(value); + } + return *this; +} + +void OpDefBuilder::Finalize(OperatorDef *op_def) const { + MACE_CHECK(op_def != nullptr, "input should not be null."); + *op_def = op_def_; +} + OpTestContext *OpTestContext::Get(int num_threads, CPUAffinityPolicy cpu_affinity_policy, bool use_gemmlowp) { @@ -67,6 +149,100 @@ void OpTestContext::SetOCLImageAndBufferTestFlag() { opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; } +bool OpsTestNet::Setup(mace::DeviceType device) { + NetDef net_def; + for (auto &op_def_ : op_defs_) { + net_def.add_op()->CopyFrom(op_def_); + + for (auto input : op_def_.input()) { + if (ws_.GetTensor(input) != nullptr && + !ws_.GetTensor(input)->is_weight()) { + auto input_info = net_def.add_input_info(); + input_info->set_name(input); + auto &shape = ws_.GetTensor(input)->shape(); + for (auto d : shape) { + input_info->add_dims(static_cast(d)); + } + } + } + + for (auto output : op_def_.output()) { + ws_.RemoveTensor(output); + auto output_info = net_def.add_output_info(); + output_info->set_name(output); + } + } + MemoryOptimizer mem_optimizer; + net_ = std::unique_ptr(new SerialNet( + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + &mem_optimizer)); + MaceStatus status = (ws_.PreallocateOutputTensor( + net_def, + &mem_optimizer, + OpTestContext::Get()->GetDevice(device))); + if (status != MaceStatus::MACE_SUCCESS) return false; + status = net_->Init(); + device_type_ = device; + return status == MaceStatus::MACE_SUCCESS; +} + +MaceStatus OpsTestNet::Run() { + MACE_CHECK_NOTNULL(net_); + MACE_RETURN_IF_ERROR(net_->Run()); + Sync(); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus 
OpsTestNet::RunOp(mace::DeviceType device) { + if (device == DeviceType::GPU) { + auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); + for (auto type : opencl_mem_types) { + OpTestContext::Get()->GetDevice(device) + ->opencl_runtime()->set_mem_type(type); + Setup(device); + MACE_RETURN_IF_ERROR(Run()); + } + return MaceStatus::MACE_SUCCESS; + } else { + Setup(device); + return Run(); + } +} + +MaceStatus OpsTestNet::RunOp() { + return RunOp(DeviceType::CPU); +} + +MaceStatus OpsTestNet::RunNet(const mace::NetDef &net_def, + const mace::DeviceType device) { + device_type_ = device; + MemoryOptimizer mem_optimizer; + net_ = std::unique_ptr(new SerialNet( + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + &mem_optimizer)); + MACE_RETURN_IF_ERROR(ws_.PreallocateOutputTensor( + net_def, + &mem_optimizer, + OpTestContext::Get()->GetDevice(device))); + MACE_RETURN_IF_ERROR(net_->Init()); + return net_->Run(); +} + +void OpsTestNet::Sync() { +#ifdef MACE_ENABLE_OPENCL + if (net_ && device_type_ == DeviceType::GPU) { + OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() + ->command_queue().finish(); + } +#endif +} + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 5bf842f38be4b7612669c122a8853be7dbb4537a..0596119194b30850eb2aca8492a23e86e7efc9d3 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -29,9 +29,9 @@ #include "mace/core/net.h" #include "mace/core/device_context.h" #include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" -#include "mace/ops/opencl/common.h" #include "mace/ops/ops_registry.h" #include "mace/public/mace.h" #include "mace/utils/utils.h" @@ -43,73 +43,29 @@ namespace test { class OpDefBuilder { public: - OpDefBuilder(const char *type, const std::string &name) { - op_def_.set_type(type); - op_def_.set_name(name); - } + OpDefBuilder(const char *type, const std::string &name); - OpDefBuilder &Input(const std::string &input_name) { - op_def_.add_input(input_name); - return *this; - } + OpDefBuilder &Input(const std::string &input_name); - OpDefBuilder &Output(const std::string &output_name) { - op_def_.add_output(output_name); - return *this; - } + OpDefBuilder &Output(const std::string &output_name); - OpDefBuilder &OutputType(const std::vector &output_type) { - for (auto out_t : output_type) { - op_def_.add_output_type(out_t); - } - return *this; - } + OpDefBuilder &OutputType(const std::vector &output_type); - OpDefBuilder AddIntArg(const std::string &name, const int value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_i(value); - return *this; - } + OpDefBuilder &OutputShape(const std::vector &output_shape); - OpDefBuilder AddFloatArg(const std::string &name, const float value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_f(value); - return *this; - } + OpDefBuilder AddIntArg(const std::string &name, const int value); - OpDefBuilder AddStringArg(const std::string &name, const char *value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_s(value); - return *this; - } + OpDefBuilder AddFloatArg(const std::string &name, const float value); + + OpDefBuilder AddStringArg(const std::string &name, const char *value); OpDefBuilder AddIntsArg(const std::string &name, - const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - 
for (auto value : values) { - arg->add_ints(value); - } - return *this; - } + const std::vector &values); OpDefBuilder AddFloatsArg(const std::string &name, - const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - for (auto value : values) { - arg->add_floats(value); - } - return *this; - } + const std::vector &values); - void Finalize(OperatorDef *op_def) const { - MACE_CHECK(op_def != nullptr, "input should not be null."); - *op_def = op_def_; - } + void Finalize(OperatorDef *op_def) const; OperatorDef op_def_; }; @@ -146,11 +102,12 @@ class OpsTestNet { void AddInputFromArray(const std::string &name, const std::vector &shape, const std::vector &data, + bool is_weight = false, const float scale = 0.0, const int32_t zero_point = 0) { Tensor *input = ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), is_weight); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -163,10 +120,11 @@ class OpsTestNet { template void AddRepeatedInput(const std::string &name, const std::vector &shape, - const T data) { + const T data, + bool is_weight = false) { Tensor *input = ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), is_weight); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -176,13 +134,14 @@ class OpsTestNet { template void AddRandomInput(const std::string &name, const std::vector &shape, + bool is_weight = false, bool positive = true, bool truncate = false, const float truncate_min = 0.001f, const float truncate_max = 100.f) { Tensor *input = ws_.CreateTensor(name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), is_weight); input->Resize(shape); Tensor::MappingGuard input_mapper(input); T *input_data = input->mutable_data(); @@ -220,28 +179,6 @@ class OpsTestNet { } } - template - void Transpose2D(const std::string &src_name, const std::string &dst_name) { - Tensor *input = ws_.GetTensor(src_name); - Tensor *output = ws_.CreateTensor( - dst_name, - OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); - const std::vector input_shape = input->shape(); - MACE_CHECK(input_shape.size() == 2, "input shape != 2"); - output->Resize({input_shape[1], input_shape[0]}); - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - for (index_t i = 0; i < input_shape[0]; ++i) { - for (index_t j = 0; j < input_shape[1]; ++j) { - output_data[j * input_shape[0] + i] = - input_data[i * input_shape[1] + j]; - } - } - } - template void CopyData(const std::string &src_name, const std::string &dst_name) { @@ -249,7 +186,8 @@ class OpsTestNet { Tensor *output = ws_.CreateTensor( dst_name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), + input->is_weight()); const std::vector input_shape = input->shape(); output->Resize(input_shape); @@ -267,7 +205,8 @@ class OpsTestNet { Tensor *output = ws_.CreateTensor( dst_name, OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); + DataTypeToEnum::v(), + input->is_weight()); const std::vector input_shape = input->shape(); MACE_CHECK(input_shape.size() == 4, "input shape != 4"); @@ -311,7 +250,25 @@ class OpsTestNet { } } } - } else if (src_format == HWOI && 
dst_format == OIHW) { + } else { + MACE_NOT_IMPLEMENTED; + } + } + + template + void TransformFilterDataFormat(const std::string &src_name, + const FilterDataFormat src_format, + const std::string &dst_name, + const FilterDataFormat dst_format) { + Tensor *input = ws_.GetTensor(src_name); + Tensor *output = ws_.CreateTensor( + dst_name, + OpTestContext::Get()->GetDevice(D)->allocator(), + DataTypeToEnum::v(), + input->is_weight()); + const std::vector input_shape = input->shape(); + MACE_CHECK(input_shape.size() == 4, "input shape != 4"); + if (src_format == HWOI && dst_format == OIHW) { index_t height = input_shape[0]; index_t width = input_shape[1]; index_t out_channels = input_shape[2]; @@ -392,34 +349,6 @@ class OpsTestNet { } } - template - void FillNHWCInputToNCHWInput(const std::string &name_nchw, - const std::string &name_nhwc) { - Tensor *input = ws_.GetTensor(name_nhwc); - Tensor *output = ws_.CreateTensor( - name_nchw, - OpTestContext::Get()->GetDevice(D)->allocator(), - DataTypeToEnum::v()); - const std::vector input_shape = input->shape(); - index_t batch = input_shape[0]; - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channels = input_shape[3]; - output->Resize({batch, channels, height, width}); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t h = 0; h < height; ++h) { - for (index_t w = 0; w < width; ++w) { - output_data[((b * channels + c) * height + h) * width + w] = - input_data[((b * height + h) * width + w) * channels + c]; - } - } - } - } - } - // Create standalone tensor on device D with T type. template std::unique_ptr CreateTensor( @@ -447,89 +376,33 @@ class OpsTestNet { return &op_defs_[op_defs_.size() - 1]; } - Workspace *ws() { return &ws_; } + inline Workspace *ws() { return &ws_; } - bool Setup(DeviceType device) { - NetDef net_def; - for (auto &op_def_ : op_defs_) { - net_def.add_op()->CopyFrom(op_def_); - } - net_ = std::unique_ptr(new SerialNet( - op_registry_.get(), - &net_def, - &ws_, - OpTestContext::Get()->GetDevice(device))); - MaceStatus status = net_->Init(); - device_type_ = device; - return status == MaceStatus::MACE_SUCCESS; - } + bool Setup(DeviceType device); - MaceStatus Run() { - MACE_CHECK_NOTNULL(net_); - MACE_RETURN_IF_ERROR(net_->Run()); - Sync(); - return MaceStatus::MACE_SUCCESS; - } + MaceStatus Run(); // DEPRECATED(liyin): // Test and benchmark should setup model once and run multiple times. // Setup time should not be counted during benchmark. - MaceStatus RunOp(DeviceType device) { - if (device == DeviceType::GPU) { - auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); - for (auto type : opencl_mem_types) { - OpTestContext::Get()->GetDevice(device) - ->opencl_runtime()->set_mem_type(type); - Setup(device); - MACE_RETURN_IF_ERROR(Run()); - } - return MaceStatus::MACE_SUCCESS; - } else { - Setup(device); - return Run(); - } - } + MaceStatus RunOp(DeviceType device); // DEPRECATED(liyin): // Test and benchmark should setup model once and run multiple times. // Setup time should not be counted during benchmark. 
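The test utilities above now funnel layout conversion through TransformDataFormat and the new TransformFilterDataFormat instead of ad-hoc helpers such as the removed FillNHWCInputToNCHWInput. The index arithmetic those helpers performed is shown below as a standalone sketch over std::vector rather than mace::Tensor:

#include <cstdint>
#include <vector>

// Standalone sketch of the NHWC -> NCHW copy that the removed
// FillNHWCInputToNCHWInput helper performed (same index arithmetic).
template <typename T>
std::vector<T> NHWCToNCHW(const std::vector<T> &in,
                          int64_t batch, int64_t height,
                          int64_t width, int64_t channels) {
  std::vector<T> out(in.size());
  for (int64_t b = 0; b < batch; ++b)
    for (int64_t c = 0; c < channels; ++c)
      for (int64_t h = 0; h < height; ++h)
        for (int64_t w = 0; w < width; ++w)
          out[((b * channels + c) * height + h) * width + w] =
              in[((b * height + h) * width + w) * channels + c];
  return out;
}

int main() {
  // {1, 1, 2, 3} NHWC -> {1, 3, 1, 2} NCHW: interleaved channels become planar.
  std::vector<int> nhwc = {0, 1, 2, 3, 4, 5};
  std::vector<int> nchw = NHWCToNCHW(nhwc, 1, 1, 2, 3);  // {0, 3, 1, 4, 2, 5}
  return nchw == std::vector<int>{0, 3, 1, 4, 2, 5} ? 0 : 1;
}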
- MaceStatus RunOp() { return RunOp(DeviceType::CPU); } - - MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { - device_type_ = device; - auto net = std::unique_ptr(new SerialNet( - op_registry_.get(), - &net_def, - &ws_, - OpTestContext::Get()->GetDevice(device), - NetMode::INIT)); - MACE_RETURN_IF_ERROR(net->Init()); - MACE_RETURN_IF_ERROR(net->Run()); - net_ = std::unique_ptr(new SerialNet( - op_registry_.get(), - &net_def, - &ws_, - OpTestContext::Get()->GetDevice(device))); - MACE_RETURN_IF_ERROR(net_->Init()); - return net_->Run(); - } + MaceStatus RunOp(); + + MaceStatus RunNet(const NetDef &net_def, const DeviceType device); - Tensor *GetOutput(const char *output_name) { + inline Tensor *GetOutput(const char *output_name) { return ws_.GetTensor(output_name); } - Tensor *GetTensor(const char *tensor_name) { + inline Tensor *GetTensor(const char *tensor_name) { return ws_.GetTensor(tensor_name); } - void Sync() { -#ifdef MACE_ENABLE_OPENCL - if (net_ && device_type_ == DeviceType::GPU) { - OpTestContext::Get()->GetDevice(DeviceType::GPU)->opencl_runtime() - ->command_queue().finish(); - } -#endif - } + void Sync(); public: std::shared_ptr op_registry_; @@ -773,50 +646,6 @@ void ExpectTensorSimilar(const Tensor &x, EXPECT_NEAR(1.0, similarity, abs_err); } -template -void BufferToImage(OpsTestNet *net, - const std::string &input_name, - const std::string &output_name, - const ops::BufferType type, - const int wino_block_size = 2) { - MACE_CHECK_NOTNULL(net); - - OpDefBuilder("BufferTransform", "BufferTransformTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("wino_block_size", wino_block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); - - // TODO(liuqi): Use AddNewOperatorDef, and run all ops with same NetDef. 
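The OpDefBuilder methods moved out-of-line above all follow the same fluent-builder shape that the tests rely on (chained setters ending in Finalize). A toy version of the pattern, not the MACE class; note how returning *this by reference keeps every chained call acting on the same builder:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy fluent builder illustrating the chaining style used by OpDefBuilder.
class ToyOpDefBuilder {
 public:
  ToyOpDefBuilder(std::string type, std::string name)
      : type_(std::move(type)), name_(std::move(name)) {}
  ToyOpDefBuilder &Input(const std::string &in) { inputs_.push_back(in); return *this; }
  ToyOpDefBuilder &Output(const std::string &out) { outputs_.push_back(out); return *this; }
  ToyOpDefBuilder &AddIntArg(const std::string &key, int value) {
    int_args_.emplace_back(key, value);
    return *this;
  }
  void Finalize(std::ostream &os) const {
    os << type_ << '(' << name_ << "): " << inputs_.size() << " input(s), "
       << outputs_.size() << " output(s), " << int_args_.size() << " int arg(s)\n";
  }
 private:
  std::string type_, name_;
  std::vector<std::string> inputs_, outputs_;
  std::vector<std::pair<std::string, int>> int_args_;
};

int main() {
  ToyOpDefBuilder("Pad", "PadTest")
      .Input("Input")
      .Output("Output")
      .AddIntArg("T", 1)
      .Finalize(std::cout);
  return 0;
}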
- net->RunOp(D); - - net->Sync(); -} - -template -void ImageToBuffer(OpsTestNet *net, - const std::string &input_name, - const std::string &output_name, - const ops::BufferType type, - const int wino_block_size = 2) { - MACE_CHECK_NOTNULL(net); - - OpDefBuilder("BufferInverseTransform", "BufferInverseTransformTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("wino_block_size", wino_block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); - - // Run - net->RunOp(D); - - net->Sync(); -} - } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc index 60bfbc192c6f50a05ca68dd0ad6d82d12182080d..cb7979063097a07be88337b5b14db63a7ffe99f4 100644 --- a/mace/ops/pad.cc +++ b/mace/ops/pad.cc @@ -33,7 +33,10 @@ class PadOp : public Operation { : Operation(context), paddings_(Operation::GetRepeatedArgs("paddings")), constant_value_(Operation::GetOptionalArg( - "constant_value", 0.0)) {} + "constant_value", 0.0)) { + MACE_CHECK(paddings_.size() == 8); + paddings_ = TransposeShape(paddings_, {0, 1, 6, 7, 2, 3, 4, 5}); + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index e295d2ad8799cc52d4b9c9ed52218a9a85ddd0cc..fb7f4e14426677b1ee26bf0ba3459ea5043074ea 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -31,23 +31,13 @@ void Pad(int iters, int batch, int height, net.AddRandomInput("Input", {batch, height, width, channels}); const std::vector paddings = {0, 0, pad, pad, pad, pad, 0, 0}; - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Pad", "PadTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("paddings", paddings) - .AddFloatArg("constant_value", 1.0) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("Pad", "PadTest") - .Input("Input") - .Output("Output") - .AddIntsArg("paddings", paddings) - .AddFloatArg("constant_value", 1.0) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("Pad", "PadTest") + .Input("Input") + .Output("Output") + .AddIntsArg("paddings", paddings) + .AddFloatArg("constant_value", 1.0) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index f0eece255af09b8e9d44a16dfe5965e67d1503c0..5de799f243e9cc51fb541f6ad5c7601e5de34cc3 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -29,27 +29,22 @@ void Simple() { // Add input data net.AddRepeatedInput("Input", {1, 2, 3, 1}, 2); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } else { net.TransformDataFormat("Input", NHWC, "TInput", NCHW); OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") - .AddIntsArg("paddings", {0, 0, 0, 0, 1, 2, 1, 2}) + .AddIntsArg("paddings", {0, 0, 1, 2, 1, 2, 0, 0}) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); @@ -111,8 +106,7 @@ TEST_F(PadTest, ComplexCPU) { namespace { template 
void Complex(const std::vector &input_shape, - const std::vector &cpu_paddings, - const std::vector &gpu_paddings) { + const std::vector &paddings) { // Construct graph OpsTestNet net; @@ -124,7 +118,7 @@ void Complex(const std::vector &input_shape, OpDefBuilder("Pad", "PadTest") .Input("TInput") .Output("TOutput") - .AddIntsArg("paddings", cpu_paddings) + .AddIntsArg("paddings", paddings) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); @@ -136,22 +130,17 @@ void Complex(const std::vector &input_shape, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pad", "PadTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("paddings", gpu_paddings) + .Input("Input") + .Output("Output") + .AddIntsArg("paddings", paddings) .AddFloatArg("constant_value", 1.0) .Finalize(net.NewOperatorDef()); // Run net.RunOp(DeviceType::GPU); - ImageToBuffer(&net, "OutputImage", "OpenCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - - auto output = net.GetTensor("OpenCLOutput"); + auto output = net.GetTensor("Output"); if (DataTypeToEnum::value == DT_HALF) { ExpectTensorNear(*expected, *output, 1e-2, 1e-2); @@ -162,21 +151,15 @@ void Complex(const std::vector &input_shape, } // namespace TEST_F(PadTest, ComplexFloat) { - Complex({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1}, - {0, 0, 2, 2, 1, 1, 0, 0}); - Complex({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0}, - {0, 0, 2, 0, 1, 0, 0, 0}); - Complex({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2}, - {0, 0, 0, 1, 0, 2, 0, 0}); + Complex({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0}); + Complex({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0}); + Complex({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0}); } TEST_F(PadTest, ComplexHalf) { - Complex({1, 32, 32, 4}, {0, 0, 0, 0, 2, 2, 1, 1}, - {0, 0, 2, 2, 1, 1, 0, 0}); - Complex({1, 31, 37, 16}, {0, 0, 0, 0, 2, 0, 1, 0}, - {0, 0, 2, 0, 1, 0, 0, 0}); - Complex({1, 128, 128, 32}, {0, 0, 0, 0, 0, 1, 0, 2}, - {0, 0, 0, 1, 0, 2, 0, 0}); + Complex({1, 32, 32, 4}, {0, 0, 2, 2, 1, 1, 0, 0}); + Complex({1, 31, 37, 16}, {0, 0, 2, 0, 1, 0, 0, 0}); + Complex({1, 128, 128, 32}, {0, 0, 0, 1, 0, 2, 0, 0}); } } // namespace test diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 2ce9d6acb6ac535311b5dc77e6161721a6c716cd..b2aef666266dfcd77b06047eab7891fd6cb82cef 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -432,6 +432,7 @@ class PoolingOp : public PoolingOpBase { if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::PoolingKernel); } else { + context->set_output_mem_type(MemoryType::GPU_BUFFER); kernel_.reset(new opencl::buffer::PoolingKernel); } } diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index ff915ec0f60f07d5626cbf931bb36806caea997c..c48cc8771fec57898dfe648abc7db7438bd5e330 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -52,8 +52,7 @@ void Pooling(int iters, MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Pooling", "PoolingTest") + OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") .AddIntArg("pooling_type", pooling_type) @@ -63,22 +62,6 @@ void Pooling(int iters, .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Pooling", "PoolingTest") 
- .Input("InputImage") - .Output("OutputImage") - .AddIntArg("pooling_type", pooling_type) - .AddIntsArg("kernels", {kernel, kernel}) - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 99691db84f1ef4882e676ad335874d45ddda4a4e..6db144e4f8fc77f2b6d58219236c1edd439bf242 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -190,11 +190,9 @@ void SimpleMaxPooling3S2() { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) .AddIntsArg("strides", {2, 2}) @@ -202,8 +200,6 @@ void SimpleMaxPooling3S2() { .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); } // Check @@ -250,11 +246,9 @@ void MaxPooling3S2(const std::vector &input_shape, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) .AddIntsArg("strides", strides) @@ -263,14 +257,12 @@ void MaxPooling3S2(const std::vector &input_shape, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, 1e-4); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } } // namespace @@ -349,11 +341,9 @@ void SimpleAvgPoolingTest() { "Input", {1, 2, 8, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) @@ -362,9 +352,6 @@ void SimpleAvgPoolingTest() { .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - // Check auto expected = net.CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); @@ -408,11 +395,9 @@ void AvgPoolingTest(const std::vector &shape, auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", kernels) .AddIntsArg("strides", strides) @@ -421,14 +406,12 @@ void AvgPoolingTest(const 
std::vector &shape, .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-3, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-3, 1e-3); } else { - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } } // namespace @@ -578,12 +561,12 @@ void TestQuant(const index_t batch, OpsTestNet net; std::vector input_shape{batch, in_height, in_width, channels}; net.AddRandomInput( - "Input", input_shape, false); + "Input", input_shape, false, false); net.TransformDataFormat( "Input", NHWC, "InputNCHW", NCHW); net.AddRandomInput( - "OutputNCHW", input_shape, true, true); + "OutputNCHW", input_shape, false, true, true); OpDefBuilder("Pooling", "PoolingTest") .Input("InputNCHW") .Output("OutputNCHW") diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc index 9364146f267cabd203dc75989c129c58ba466b76..20f7e81c8b54165388de9f5fd2f359c4d42d1862 100644 --- a/mace/ops/reduce_mean.cc +++ b/mace/ops/reduce_mean.cc @@ -47,7 +47,7 @@ class ReduceMeanOpBase : public Operation { } protected: - const std::vector axis_; + std::vector axis_; bool keep_dims_; }; @@ -58,7 +58,8 @@ template class ReduceMeanOp : public ReduceMeanOpBase { public: explicit ReduceMeanOp(OpConstructContext *context) - : ReduceMeanOpBase(context) {} + : ReduceMeanOpBase(context) { + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); @@ -80,9 +81,15 @@ class ReduceMeanOp : public ReduceMeanOpBase { } } else { for (unsigned int i = 0; i < axis_.size(); ++i) { - const int index = axis_[i] >= 0 ? - axis_[i] : - axis_[i] + input->dim_size(); + int index = axis_[i] >= 0 ? 
+ axis_[i] : + axis_[i] + input->dim_size(); + // axis format is NHWC + if (input->dim_size() == 4) { + if (index == 1) index = 2; + else if (index == 2) index = 3; + else if (index == 3) index = 1; + } bitmap[index] = true; } } diff --git a/mace/ops/reduce_mean_benchmark.cc b/mace/ops/reduce_mean_benchmark.cc index 24338ce77e3258af1f23f04a64ae57421f629a5e..60a255009c3b614c90aeb2607dc3c5e78ef2472e 100644 --- a/mace/ops/reduce_mean_benchmark.cc +++ b/mace/ops/reduce_mean_benchmark.cc @@ -27,26 +27,20 @@ void ReduceMean(int iters, int batch, int channels, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - + std::vector axis = {1, 2}; if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ReduceMean", "ReduceMeanBM") - .Input("InputImage") - .AddIntsArg("axis", {1, 2}) - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input", {batch, height, width, channels}); } else { - net.TransformDataFormat("Input", NHWC, "InputNCHW", - NCHW); - OpDefBuilder("ReduceMean", "ReduceMeanBM") - .Input("InputNCHW") - .AddIntsArg("axis", {2, 3}) - .Output("Output") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input", {batch, channels, height, width}); } + OpDefBuilder("ReduceMean", "ReduceMeanBM") + .Input("Input") + .AddIntsArg("axis", axis) + .Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index bc2577e29352f193d97403d6f221c94e033ddb2e..ef455f85a4cf0961fb24975b47fe88640d2e7150 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -34,32 +34,54 @@ void Simple(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input); if (D == DeviceType::CPU) { + net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); OpDefBuilder("ReduceMean", "ReduceMeanTest") - .Input("Input") + .Input("InputNCHW") .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) - .Output("Output") + .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); + net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ReduceMean", "ReduceMeanTest") - .Input("InputImg") + .Input("Input") .AddIntsArg("axis", axis) .AddIntArg("keepdims", keepdims ? 1 : 0) - .Output("OutputImg") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); } +template +void Simple3D(const std::vector &input_shape, + const std::vector &input, + const std::vector &axis, + const std::vector &output_shape, + const std::vector &output, + const bool keepdims = true) { + // Construct graph + OpsTestNet net; + // Add input data + net.AddInputFromArray("Input", input_shape, input); + + OpDefBuilder("ReduceMean", "ReduceMeanTest") + .Input("Input") + .AddIntsArg("axis", axis) + .AddIntArg("keepdims", keepdims ? 
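The ReduceMeanOp change above remaps axes for 4-D inputs because the axis attribute keeps NHWC semantics while the CPU tensor is laid out NCHW: H(1) maps to 2, W(2) to 3, and C(3) to 1, which is why the hand-written axis_cpu remapping was dropped from the tests. A standalone sketch of that mapping:

#include <iostream>

// Standalone sketch of ReduceMeanOp's NHWC-to-NCHW axis remapping for 4-D
// tensors, mirroring the branch added above.
int RemapNHWCAxisToNCHW(int axis, int dim_size) {
  int index = axis >= 0 ? axis : axis + dim_size;
  if (dim_size == 4) {
    if (index == 1) index = 2;
    else if (index == 2) index = 3;
    else if (index == 3) index = 1;
  }
  return index;
}

int main() {
  // The NHWC spatial axes {1, 2} become the NCHW axes {2, 3}; channel 3 becomes 1.
  for (int a : {1, 2, 3})
    std::cout << a << " -> " << RemapNHWCAxisToNCHW(a, 4) << '\n';
  return 0;
}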
1 : 0) + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + auto expected = net.CreateTensor(output_shape, output); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); +} + template void Simple12Test() { Simple({2, 2, 3, 4}, @@ -157,26 +179,6 @@ void Simple2Axis() { {0, 1}, {1, 1, 3, 4}, {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); - Simple({2, 3, 4}, - {0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - 16, 17, 18, 19, - 20, 21, 22, 23}, - {0, 1}, - {1, 1, 4}, - {10, 11, 12, 13}); - Simple({2, 3, 4}, - {0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - 16, 17, 18, 19, - 20, 21, 22, 23}, - {1, 2}, - {2, 1, 1}, - {5.5, 17.5}); Simple({1, 2, 3, 4}, {0, 1, 2, 3, 4, 5, 6, 7, @@ -220,6 +222,31 @@ void Simple2Axis() { {4, 13, 22}); } +template +void Simple2Axis3D() { + Simple3D({2, 3, 4}, + {0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 17, 18, 19, + 20, 21, 22, 23}, + {0, 1}, + {1, 1, 4}, + {10, 11, 12, 13}); + Simple3D({2, 3, 4}, + {0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 17, 18, 19, + 20, 21, 22, 23}, + {1, 2}, + {2, 1, 1}, + {5.5, 17.5}); +} + + template void Simple3Axis() { Simple({1, 2, 3, 4}, @@ -310,21 +337,22 @@ TEST_F(ReduceMeanOpTest, CPUSimple2Axis) { Simple2Axis(); } +TEST_F(ReduceMeanOpTest, CPUSimple2Axis3D) { + Simple2Axis3D(); +} + TEST_F(ReduceMeanOpTest, CPUSimple3Axis) { Simple3Axis(); } TEST_F(ReduceMeanOpTest, CPUSimpleReduceDims) { - Simple({2, 2, 3, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, - {1, 2}, - {2, 4}, - {10, 11, 12, 13, - 10, 11, 12, 13}, - false); + Simple3D({2, 3, 4}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, + {0, 1}, + {4}, + {10, 11, 12, 13}, + false); } namespace { @@ -338,21 +366,11 @@ void RandomTest(const std::vector &input_shape, // Add input data net.AddRandomInput("Input", input_shape); - std::vector axis_cpu(axis.size()); - for (unsigned int i = 0; i < axis.size(); ++i) { - if (axis[i] == 1 || axis[i] == 2) - axis_cpu[i] = axis[i] + 1; - else if (axis[i] == 3) - axis_cpu[i] = 1; - else - axis_cpu[i] = axis[i]; - } - net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); OpDefBuilder("ReduceMean", "ReduceMeanTest") .Input("InputNCHW") - .AddIntsArg("axis", axis_cpu) + .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) .Output("OutputNCHW") .Finalize(net.NewOperatorDef()); @@ -360,18 +378,14 @@ void RandomTest(const std::vector &input_shape, net.RunOp(); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ReduceMean", "ReduceMeanTest") - .Input("InputImg") + .Input("Input") .AddIntsArg("axis", axis) .AddIntArg("keepdims", 1) - .Output("OutputImg") + .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-5, 1e-4); diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index 896fb1e0056fdb37aa095707a7504d5e75da7533..5ababebaa29676f289c368222bde120acf9c0aca 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -43,30 +43,13 @@ void 
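As a standalone check of the new Simple2Axis3D expectations above: reducing the {2, 3, 4} tensor holding 0..23 over axes {0, 1} gives {10, 11, 12, 13}, and over axes {1, 2} gives {5.5, 17.5}. Recomputed directly:

#include <iostream>
#include <vector>

// Recompute the Simple2Axis3D reduce-mean expectations by hand.
int main() {
  const int d0 = 2, d1 = 3, d2 = 4;
  std::vector<float> x(d0 * d1 * d2);
  for (int i = 0; i < d0 * d1 * d2; ++i) x[i] = static_cast<float>(i);

  std::vector<float> mean_01(d2, 0.f);  // reduce over axes {0, 1}
  std::vector<float> mean_12(d0, 0.f);  // reduce over axes {1, 2}
  for (int i = 0; i < d0; ++i)
    for (int j = 0; j < d1; ++j)
      for (int k = 0; k < d2; ++k) {
        const float v = x[(i * d1 + j) * d2 + k];
        mean_01[k] += v / (d0 * d1);
        mean_12[i] += v / (d1 * d2);
      }

  for (float v : mean_01) std::cout << v << ' ';  // 10 11 12 13
  std::cout << '\n';
  for (float v : mean_12) std::cout << v << ' ';  // 5.5 17.5
  std::cout << '\n';
  return 0;
}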
ResizeBicubicBenchmark(int iters, } else { MACE_NOT_IMPLEMENTED; } - net.AddInputFromArray("OutSize", {2}, - {output_height, output_width}); - if (D == DeviceType::CPU) { - OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark") + OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark") .Input("Input") - .Input("OutSize") .Output("Output") .AddIntsArg("size", {output_height, output_width}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBicubic", "ResizeBicubicBenchmark") - .Input("InputImage") - .Input("OutSize") - .Output("OutputImage") - .AddIntsArg("size", {output_height, output_width}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 3a33eefc5b930ba0a89ae3d579e42efd5abdc620..5a4afc355a021179d0453344b6d2247a62721cf6 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -132,7 +132,7 @@ void TestRandomResizeBicubic() { // Add input data net.AddRandomInput("Input", {batch, in_height, in_width, channels}, - true, true); + false, true, true); net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); @@ -151,23 +151,17 @@ void TestRandomResizeBicubic() { expected.Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBicubic", "ResizeBicubicTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("align_corners", align_corners) .AddIntsArg("size", {height, width}) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "DeviceOutput", - ops::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(expected, *net.GetOutput("DeviceOutput"), 1e-2, + ExpectTensorNear(expected, *net.GetOutput("Output"), 1e-2, 1e-2); } } diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 345f86bb041730337a594922d09a6ca3d2a32743..bace4f10374d681df889e6fd5451c37abc2d646c 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -50,30 +50,12 @@ void ResizeBilinearBenchmark(int iters, } else { MACE_NOT_IMPLEMENTED; } - net.AddInputFromArray("OutSize", {2}, - {output_height, output_width}); - - if (D == DeviceType::CPU) { - OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") + OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") .Input("Input") - .Input("OutSize") .Output("Output") .AddIntsArg("size", {output_height, output_width}) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") - .Input("InputImage") - .Input("OutSize") - .Output("OutputImage") - .AddIntsArg("size", {output_height, output_width}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 
b611854fadb86814fd8b24732ba9eb1de07931b9..e7b7a296929b0aae2fef068a072c03a9fdabfebc 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -118,23 +118,17 @@ void TestRandomResizeBilinear() { expected->Copy(*net.GetOutput("Output")); if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("align_corners", align_corners) .AddIntsArg("size", {height, width}) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - - ImageToBuffer(&net, "OutputImage", "DeviceOutput", - ops::BufferType::IN_OUT_CHANNEL); } // Check - ExpectTensorNear(*expected, *net.GetOutput("DeviceOutput"), 1e-5, + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-6); } } @@ -157,6 +151,7 @@ void TestQuantizedResizeBilinear() { net.AddRandomInput("Input", {batch, in_height, in_width, channels}, false, + false, true, -1.f, 1.f); diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc index 4a866ae7592787f89b002bac6153b820fc4aaf2a..5539e53f83be152a839e9bfa98178c2fedb933c6 100644 --- a/mace/ops/scalar_math.cc +++ b/mace/ops/scalar_math.cc @@ -93,7 +93,11 @@ class ScalarMathOp : public Operation { coeff_(Operation::GetRepeatedArgs("coeff")), scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), scalar_input_index_(Operation::GetOptionalArg( - "scalar_input_index", 1)) {} + "scalar_input_index", 1)) { + if (D == DeviceType::GPU) { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc index b981267aa3c9e20e3c793a5174c3813fb9c59d1a..675ab7c82a7fa553d9ec69cd6f4a77b68f5ceb98 100644 --- a/mace/ops/shape.cc +++ b/mace/ops/shape.cc @@ -21,7 +21,11 @@ template class ShapeOp : public Operation { public: explicit ShapeOp(OpConstructContext *context) - : Operation(context) {} + : Operation(context) { + if (D == DeviceType::GPU) { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc index bf06114430be46dfd37046921f09afa33ce3fe5d..4a7505ae79bcbc211ae9fa17f65a4f941b8988a2 100644 --- a/mace/ops/softmax.cc +++ b/mace/ops/softmax.cc @@ -367,6 +367,7 @@ class SoftmaxOp : public Operation { if (context->device()->opencl_runtime()->UseImageMemory()) { kernel_.reset(new opencl::image::SoftmaxKernel); } else { + context->set_output_mem_type(MemoryType::GPU_BUFFER); kernel_.reset(new opencl::buffer::SoftmaxKernel); } } diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 66e27434c82175b895272d5bab862ee83d0c5ae2..25095da54f94324afd34274f79b09c59c1b4e3a7 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -38,22 +38,11 @@ void SoftmaxBenchmark( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("Softmax", "SoftmaxBM") + OpDefBuilder("Softmax", "SoftmaxBM") .Input("Input") .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("Softmax", "SoftmaxBM") - .Input("InputImage") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for 
(int i = 0; i < 5; ++i) { diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 69b5dafdfeb8390615f926f092d77c0b47fea071..af32d4ab8ad97a10ed58707f02efdd1c67741fb1 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -59,21 +59,14 @@ void Simple() { net.GetOutput("Output")->Reshape({1, 1, 2, 4}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Softmax", "SoftmaxTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } else { MACE_NOT_IMPLEMENTED; @@ -115,22 +108,15 @@ void Complex(const std::vector &logits_shape) { auto expected = net.CreateTensor(); expected->Copy(*net.GetOutput("Output")); - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("Softmax", "SoftmaxTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run on gpu net.RunOp(D); - // Transfer output - ImageToBuffer(&net, "OutputImage", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - - ExpectTensorNear(*expected, *net.GetOutput("OPENCLOutput"), 1e-5); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } } // namespace @@ -158,7 +144,7 @@ namespace { void TestQuantizedSoftmax(const std::vector &input_shape) { OpsTestNet net; - net.AddRandomInput("Input", input_shape, false, true); + net.AddRandomInput("Input", input_shape, false, false, true); OpDefBuilder("Softmax", "SoftmaxTest") .Input("Input") diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index f6d5ad1aab8a98352d4548e1002694b219a77334..cacadfcd9673019a9c3f7938d72ebc3d45608c96 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -38,24 +38,13 @@ void BMSpaceToBatch( net.AddRandomInput("Input", {batch, height, width, channels}); } - if (D == DeviceType::CPU) { - OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("Input") - .Output("Output") - .AddIntsArg("paddings", {shape, shape, shape, shape}) - .AddIntsArg("block_shape", {shape, shape}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntsArg("paddings", {shape, shape, shape, shape}) - .AddIntsArg("block_shape", {shape, shape}) - .Finalize(net.NewOperatorDef()); - } + OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") + .Input("Input") + .Output("Output") + .AddIntsArg("paddings", {shape, shape, shape, shape}) + .AddIntsArg("block_shape", {shape, shape}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index 956dedc2c7f069c937bb09e5174225c426b3c7a5..3a928c6de0802ecd194ddae9723e4c4399a03dc1 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -32,11 +32,9 @@ void RunSpaceToBatch(const std::vector 
&input_shape, net.AddInputFromArray("Input", input_shape, input_data); if (D == GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntsArg("paddings", padding_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); @@ -54,10 +52,7 @@ void RunSpaceToBatch(const std::vector &input_shape, // Run net.RunOp(D); - if (D == GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else if (D == CPU) { + if (D == CPU) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } @@ -76,11 +71,9 @@ void RunBatchToSpace(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); if (D == GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntsArg("crops", crops_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); @@ -98,10 +91,7 @@ void RunBatchToSpace(const std::vector &input_shape, // Run net.RunOp(D); - if (D == GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } else if (D == CPU) { + if (D == CPU) { net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); } @@ -156,17 +146,13 @@ void TestSpaceToBatchLargeInput(const std::vector &input_shape, net.AddRandomInput("Input", input_shape); // run gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("OutputGPU") .AddIntsArg("paddings", padding_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(GPU); - ImageToBuffer(&net, "OutputImage", "OutputGPU", - ops::BufferType::IN_OUT_CHANNEL); // run cpu net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -193,17 +179,13 @@ void TestoBatchToSpaceLargeInput(const std::vector &input_shape, net.AddRandomInput("Input", input_shape); // run gpu - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("OutputGPU") .AddIntsArg("crops", crops_data) .AddIntsArg("block_shape", block_shape_data) .Finalize(net.NewOperatorDef()); net.RunOp(GPU); - ImageToBuffer(&net, "OutputImage", "OutputGPU", - ops::BufferType::IN_OUT_CHANNEL); // run cpu net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -227,7 +209,13 @@ void TestSpaceToBatchQuantize(const std::vector &input_shape, const std::vector &block_shape_data, const std::vector &padding_data) { OpsTestNet net; - net.AddRandomInput("Input", input_shape, false, true, -1.f, 1.f); + net.AddRandomInput("Input", + input_shape, + false, + false, + true, + -1.f, + 1.f); // run cpu net.TransformDataFormat("Input", NHWC, "InputNCHW", @@ -282,7 +270,13 @@ void TestoBatchToSpaceQuantize(const std::vector &input_shape, const std::vector &block_shape_data, const std::vector &crops_data) { OpsTestNet net; - net.AddRandomInput("Input", input_shape, false, true, -1.f, 1.f); + net.AddRandomInput("Input", + input_shape, + false, + false, + true, + -1.f, + 1.f); // run cpu net.TransformDataFormat("Input", NHWC, 
"InputNCHW", diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index 04760c5454457ab06848c3b715abd7697ef27ce0..3311d6186272cee46cc53f8e6d9426e9eb962295 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -36,23 +36,12 @@ void SpaceToDepth( MACE_NOT_IMPLEMENTED; } - if (D == DeviceType::CPU) { - OpDefBuilder("SpaceToDepth", "SpaceToDepthBM") + OpDefBuilder("SpaceToDepth", "SpaceToDepthBM") .Input("Input") .Output("Output") + .AddIntArg("block_size", block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - } else if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - OpDefBuilder("SpaceToDepth", "SpaceToDepthBM") - .Input("InputImage") - .Output("Output") - .AddIntArg("block_size", block_size) - .Finalize(net.NewOperatorDef()); - } else { - MACE_NOT_IMPLEMENTED; - } // Warm-up for (int i = 0; i < 5; ++i) { diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index e7ae77d6637225b9a7377d5fcb6e806c22992931..a0c4a9b86a4994e5ba7d59e297ae274132a9db37 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -45,21 +45,15 @@ void RunSpaceToDepth(const std::vector &input_shape, "Output", NHWC); } else { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") - .Input("InputImage") - .Output("OutputImage") + .Input("Input") + .Output("Output") .AddIntArg("block_size", block_size) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - if (D == DeviceType::GPU) { - ImageToBuffer(&net, "OutputImage", "Output", - ops::BufferType::IN_OUT_CHANNEL); - } auto expected = net.CreateTensor(expected_shape, expected_data); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -127,22 +121,16 @@ void RandomTest(const int block_size, net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input", "InputImg", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("SpaceToDepth", "SpaceToDepthTest") - .Input("InputImg") + .Input("Input") .AddIntArg("block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("OutputImg") + .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-5); diff --git a/mace/ops/split.cc b/mace/ops/split.cc index 1d632329cdef22fd37a2689498b05bbf1f2a60dc..2e09663178c45495b670b75a72ac7a013f478dc0 100644 --- a/mace/ops/split.cc +++ b/mace/ops/split.cc @@ -31,16 +31,27 @@ class SplitOp : public Operation { public: explicit SplitOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 3)) {} + axis_(Operation::GetOptionalArg("axis", 3)), + checked_(false) {} + + void Validate() { + if (this->Input(0)->dim_size() == 4) { + if (axis_ == 3) axis_ = 1; + else if (axis_ == 2) axis_ = 3; + else if (axis_ == 1) axis_ = 2; + } + MACE_CHECK(this->OutputSize() >= 2) + << "There must be at least two outputs for slicing"; + MACE_CHECK((this->Input(0)->dim(axis_) % this->OutputSize()) == 0) + << "Outputs do not split input equally."; + checked_ = true; + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); - MACE_CHECK(this->OutputSize() >= 2) - << "There 
must be at least two outputs for slicing"; + if (!checked_) Validate(); const Tensor *input = this->Input(0); const std::vector output_list = this->Outputs(); - MACE_CHECK((input->dim(axis_) % this->OutputSize()) == 0) - << "Outputs do not split input equally."; const index_t input_channels = input->dim(axis_); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; @@ -83,6 +94,7 @@ class SplitOp : public Operation { private: int32_t axis_; + bool checked_; }; diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 687fc5739bdb9f01c262b4cebfd7cf1361890f7d..b21da8f5c7f055437a6a59952c3bea4957636efd 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -37,26 +37,14 @@ void BMSplitHelper(int iters, GenerateRandomRealTypeData(input_shape, &input_data); net.AddInputFromArray("Input", input_shape, input_data); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - auto builder = OpDefBuilder("Split", "SplitTest"); - builder.Input("InputImage"); - for (int i = 0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("OutputImage", i)); - } - builder - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - auto builder = OpDefBuilder("Split", "SplitTest"); - builder.Input("Input"); - for (int i = 0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("Output", i)); - } - builder.Finalize(net.NewOperatorDef()); + auto builder = OpDefBuilder("Split", "SplitTest"); + builder.Input("Input"); + for (int i = 0; i < num_outputs; ++i) { + builder = builder.Output(MakeString("Output", i)); } + builder + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Warm-up for (int i = 0; i < 2; ++i) { diff --git a/mace/ops/split_test.cc b/mace/ops/split_test.cc index 906a47dd8f0a3ad8b74f091bd5c50b20d53669ad..89fbbadefbb39b4a3bc6446f8c6ed58e074636f5 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -26,7 +26,7 @@ class SplitOpTest : public OpsTestBase {}; namespace { template -void RandomTest(const int num_outputs, const int axis) { +void RandomTest(const int num_outputs, int axis) { static unsigned int seed = time(NULL); const index_t output_channels = 4 * (1 + rand_r(&seed) % 10); const index_t input_channels = num_outputs * output_channels; @@ -38,9 +38,9 @@ void RandomTest(const int num_outputs, const int axis) { OpsTestNet net; std::vector input_shape; - if (axis == 1) + if (D == DeviceType::CPU) input_shape = {batch, input_channels, height, width}; - else if (axis == 3) + else input_shape = {batch, height, width, input_channels}; const index_t input_size = std::accumulate( input_shape.begin(), input_shape.end(), 1, std::multiplies()); @@ -48,43 +48,25 @@ void RandomTest(const int num_outputs, const int axis) { GenerateRandomRealTypeData(input_shape, &input_data); net.AddInputFromArray("Input", input_shape, input_data); - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - - auto builder = OpDefBuilder("Split", "SplitTest"); - builder.Input("InputImage"); - for (int i = 0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("OutputImage", i)); - } - builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - auto builder = OpDefBuilder("Split", "SplitTest").AddIntArg("axis", axis); - builder.Input("Input"); - for (int i = 
0; i < num_outputs; ++i) { - builder = builder.Output(MakeString("Output", i)); - } - builder.Finalize(net.NewOperatorDef()); + auto builder = OpDefBuilder("Split", "SplitTest").AddIntArg("axis", axis); + builder.Input("Input"); + for (int i = 0; i < num_outputs; ++i) { + builder = builder.Output(MakeString("Output", i)); } + builder.AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - if (D == DeviceType::GPU) { - for (int i = 0; i < num_outputs; ++i) { - ImageToBuffer(&net, MakeString("OutputImage", i), - MakeString("Output", i), - ops::BufferType::IN_OUT_CHANNEL); - } - } - // Check std::vector expected_shape; - if (axis == 1) + if (D == DeviceType::CPU) { + if (axis == 3) axis = 1; expected_shape = {batch, output_channels, height, width}; - else if (axis == 3) + } else { expected_shape = {batch, height, width, output_channels}; + } const index_t outer_size = std::accumulate(expected_shape.begin(), expected_shape.begin() + axis, 1, std::multiplies()); @@ -117,9 +99,9 @@ TEST_F(SplitOpTest, CPU) { } TEST_F(SplitOpTest, CPUAxis1) { - RandomTest(2, 1); - RandomTest(4, 1); - RandomTest(11, 1); + RandomTest(2, 3); + RandomTest(4, 3); + RandomTest(11, 3); } TEST_F(SplitOpTest, OPENCLFloat) { diff --git a/mace/ops/sqrdiff_mean_benchmark.cc b/mace/ops/sqrdiff_mean_benchmark.cc index bcf075004835b99e347c2139acd3f2e2244a65aa..353d8e7addfa4748fb7a160710bea226d3c569ab 100644 --- a/mace/ops/sqrdiff_mean_benchmark.cc +++ b/mace/ops/sqrdiff_mean_benchmark.cc @@ -29,35 +29,21 @@ void SqrDiffMean(int iters, int batch, int channels, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Input1", {batch, 1, 1, channels}); - - if (D == DeviceType::GPU) { - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImage1", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM") - .Input("InputImage") - .Input("InputImage1") - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); + if (D == DeviceType::CPU) { + net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input1", {batch, channels, 1, 1}); } else { - net.TransformDataFormat("Input", - NHWC, - "InputNCHW", - NCHW); - net.TransformDataFormat("Input1", - NHWC, - "InputNCHW1", - NCHW); - OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM") - .Input("InputNCHW") - .Input("InputNCHW1") - .Output("Output") - .Finalize(net.NewOperatorDef()); + net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Input1", {batch, 1, 1, channels}); } + OpDefBuilder("SqrDiffMean", "SqrDiffMeanBM") + .Input("Input") + .Input("Input1") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); diff --git a/mace/ops/sqrdiff_mean_test.cc b/mace/ops/sqrdiff_mean_test.cc index 66f852b71653b2b2769dd1fac46542bd7a8b48b3..d71e8f7f0107af479b7f728dde253f63b728fe05 100644 --- a/mace/ops/sqrdiff_mean_test.cc +++ b/mace/ops/sqrdiff_mean_test.cc @@ -58,19 +58,13 @@ void Simple(const std::vector &input_shape0, "Output", NHWC); } else { - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") - .Input("InputImg0") - .Input("InputImg1") - .Output("OutputImg") + .Input("Input0") + 
.Input("Input1") + .Output("Output") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "Output", - ops::BufferType::IN_OUT_CHANNEL); } auto expected = net.CreateTensor(output_shape, output); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5, 1e-3); @@ -126,19 +120,13 @@ void RandomTest(const std::vector &input_shape0, net.RunOp(); net.TransformDataFormat("OutputNCHW", NCHW, "Output", NHWC); - BufferToImage(&net, "Input0", "InputImg0", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Input1", "InputImg1", - ops::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SqrDiffMean", "SqrDiffMeanTest") - .Input("InputImg0") - .Input("InputImg1") - .Output("OutputImg") + .Input("Input0") + .Input("Input1") + .Output("OPENCLOutput") .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(&net, "OutputImg", "OPENCLOutput", - ops::BufferType::IN_OUT_CHANNEL); if (DataTypeToEnum::value == DT_FLOAT) { ExpectTensorNear(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-4, 1e-3); diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc index 0cd15752f52adc7383239591c96c077e4354ac04..bf86a84feb33026047c44951e2acdfbc30467ec2 100644 --- a/mace/ops/squeeze.cc +++ b/mace/ops/squeeze.cc @@ -25,10 +25,20 @@ class SqueezeOp : public Operation { public: explicit SqueezeOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetRepeatedArgs("axis", {})) {} + axis_(Operation::GetRepeatedArgs("axis", {})), + checked_(false) {} MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); + if (!checked_ && D == DeviceType::CPU + && DataTypeToEnum::value != DT_UINT8 + && this->Input(0)->dim_size() == 4) { + if (axis_.size() == 2 && axis_[0] == 1 && axis_[1] == 2) { + axis_[0] = 2; + axis_[1] = 3; + } + checked_ = true; + } const Tensor *input = this->Input(0); Tensor *output = this->Output(0); @@ -48,6 +58,7 @@ class SqueezeOp : public Operation { private: std::vector axis_; + bool checked_; }; void RegisterSqueeze(OpRegistryBase *op_registry) { diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index 1bcd6c37c41facaa857b9d31a46b17c4bdc5178d..b0fc972cd0479d52bbbd0eff3e96a0bdd7b0a176 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -58,6 +58,7 @@ TEST_F(SqueezeOpTest, TestSqueeze) { TestSqueeze({1, 2, 1, 4}, {1}, {1, 2, 1, 4}); TestSqueeze({1, 2, 1, 4}, {2}, {1, 2, 4}); TestSqueeze({1}, {}, {}); + TestSqueeze({1, 4, 1, 1}, {1, 2}, {1, 4}); } } // namespace test diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc index de795965d3211d50d8c29aabbe87294754dbe502..f6269b0f4a08d471a0e25efbe3374142e5a9e20c 100644 --- a/mace/ops/stack.cc +++ b/mace/ops/stack.cc @@ -25,7 +25,11 @@ class StackOp : public Operation { public: explicit StackOp(OpConstructContext *context) : Operation(context), - axis_(Operation::GetOptionalArg("axis", 0)) {} + axis_(Operation::GetOptionalArg("axis", 0)) { + if (D == DeviceType::GPU) { + context->set_output_mem_type(MemoryType::GPU_BUFFER); + } + } MaceStatus Run(OpContext *context) override { MACE_UNUSED(context); diff --git a/mace/ops/transformer.cc b/mace/ops/transformer.cc deleted file mode 100644 index 7df66ffaf96f79d84d2ef454f16728e959386373..0000000000000000000000000000000000000000 --- a/mace/ops/transformer.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/transformer.h" - -#include -#include - -namespace mace { -namespace ops { - -std::unique_ptr Transformer::DoTransform( - mace::OperatorDef *op_def, - const int input_idx, - const mace::DataType dt, - const BufferType buffer_type, - const MemoryType mem_type) { - int32_t device = op_def->device_type(); - std::string input_name = op_def->input(input_idx); - std::string output_name = input_name + "_transformed"; - - op_def->set_input(input_idx, output_name); - std::unique_ptr op(new OperatorDef); - op->set_name(output_name); - op->set_type("BufferTransform"); - op->add_input(input_name); - op->add_output(output_name); - Argument *arg = op->add_arg(); - arg->set_name("buffer_type"); - arg->set_i(static_cast(buffer_type)); - arg = op->add_arg(); - arg->set_name("mem_type"); - arg->set_i(static_cast(mem_type)); - arg = op->add_arg(); - arg->set_name("T"); - arg->set_i(static_cast(dt)); - arg = op->add_arg(); - arg->set_name("device"); - arg->set_i(device); - - return std::move(op); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/transformer.h b/mace/ops/transformer.h deleted file mode 100644 index 67ecd60f768f9b4326f8f331fa600134d0d8776b..0000000000000000000000000000000000000000 --- a/mace/ops/transformer.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_TRANSFORMER_H_ -#define MACE_KERNELS_TRANSFORMER_H_ - -#include "mace/core/transformer.h" -#include "mace/ops/opencl/common.h" - -namespace mace { -class OpContext; -namespace ops { - -class Transformer : public TransformerBase { - public: - // Transform source tensor to target. 
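
The reduce, split, and squeeze changes earlier in this patch all apply the same convention: axes are specified in NHWC order in the model, but the CPU implementation operates on NCHW tensors, so for 4-D inputs an NHWC axis index is remapped to its NCHW position before use (H: 1 to 2, W: 2 to 3, C: 3 to 1, N unchanged). The patch writes this remapping inline in each op; the helper below is not part of the patch, it only restates that mapping in one place:

// Map an axis given in NHWC order to the corresponding NCHW position.
// Only meaningful for 4-D tensors; other ranks are returned unchanged.
inline int MapAxisFromNHWCToNCHW(int axis, int dim_size) {
  if (dim_size != 4) return axis;
  switch (axis) {
    case 1: return 2;   // H
    case 2: return 3;   // W
    case 3: return 1;   // C
    default: return axis;  // N stays at index 0
  }
}

For example, SqueezeOp rewrites axis {1, 2} (H, W in NHWC) to {2, 3} for the NCHW CPU path, and SplitOp's new Validate() turns axis 3 (channels in NHWC) into axis 1.
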
- std::vector> ConstructTranformOp( - OperatorDef *op_def, - bool transform_filter = true) override; - private: - std::unique_ptr DoTransform( - mace::OperatorDef *op_def, - const int input_idx, - const mace::DataType dt, - const BufferType buffer_type, - const MemoryType mem_type); -}; - - -} // namespace ops -} // namespace mace - -#endif // MACE_KERNELS_TENSOR_TRANSFORMER_H_ diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc index 4e98944cd889aa4669fe8fd1d53003b6d069896d..7c25ea4f5b679eef411202bfdbe0d01a03aa2977 100644 --- a/mace/ops/transpose.cc +++ b/mace/ops/transpose.cc @@ -20,14 +20,16 @@ #include #include "mace/core/operator.h" +#include "mace/ops/transpose.h" namespace mace { namespace ops { -static void TransposeNHWCToNCHWC3(const float *input, - float *output, - const index_t height, - const index_t width) { +namespace { +void TransposeNHWCToNCHWC3(const float *input, + float *output, + const index_t height, + const index_t width) { index_t image_size = height * width; #pragma omp parallel for @@ -62,10 +64,10 @@ static void TransposeNHWCToNCHWC3(const float *input, } } -static void TransposeNCHWToNHWCC2(const float *input, - float *output, - const index_t height, - const index_t width) { +void TransposeNCHWToNHWCC2(const float *input, + float *output, + const index_t height, + const index_t width) { index_t image_size = height * width; #pragma omp parallel for for (index_t h = 0; h < height; ++h) { @@ -97,9 +99,125 @@ static void TransposeNCHWToNHWCC2(const float *input, #endif } } +} // namespace + +MaceStatus Transpose(const float *input, + const std::vector &input_shape, + const std::vector &dst_dims, + float *output) { + MACE_CHECK((input_shape.size() == 2 && dst_dims.size() == 2) || + (input_shape.size() == 4 && dst_dims.size() == 4), + "Only support 2D or 4D transpose"); + + std::vector output_shape; + for (size_t i = 0; i < dst_dims.size(); ++i) { + output_shape.push_back(input_shape[dst_dims[i]]); + } + + if (input_shape.size() == 2) { + MACE_CHECK(dst_dims[0] == 1 && dst_dims[1] == 0, "no need transform"); + index_t height = input_shape[0]; + index_t width = input_shape[1]; + index_t stride_i = height; + index_t stride_j = width; + index_t tile_size = height > 512 || width > 512 ? 
64 : 32; +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < height; i += tile_size) { + for (index_t j = 0; j < width; j += tile_size) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + output[tile_j * stride_i + tile_i] = + input[tile_i * stride_j + tile_j]; + } + } + } + } + } else if (input_shape.size() == 4) { + std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; + std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; + index_t batch_size = input_shape[1] * input_shape[2] * input_shape[3]; + + if (dst_dims == transpose_order_from_NHWC_to_NCHW && input_shape[3] == 3) { + for (index_t b = 0; b < input_shape[0]; ++b) { + TransposeNHWCToNCHWC3(input + b * batch_size, + output + b * batch_size, + input_shape[1], + input_shape[2]); + } + } else if (dst_dims == transpose_order_from_NCHW_to_NHWC + && input_shape[1] == 2) { + for (index_t b = 0; b < input_shape[0]; ++b) { + TransposeNCHWToNHWCC2(input + b * batch_size, + output + b * batch_size, + input_shape[2], + input_shape[3]); + } + } else if (dst_dims == std::vector{0, 2, 1, 3}) { + index_t height = input_shape[1]; + index_t width = input_shape[2]; + index_t channel = input_shape[3]; + index_t channel_raw_size = channel * sizeof(float); + index_t stride_i = height; + index_t stride_j = width; + index_t tile_size = std::max(static_cast(1), + static_cast(std::sqrt( + 8 * 1024 / channel))); +#pragma omp parallel for collapse(2) + for (index_t i = 0; i < height; i += tile_size) { + for (index_t j = 0; j < width; j += tile_size) { + index_t end_i = std::min(i + tile_size, height); + index_t end_j = std::min(j + tile_size, width); + for (index_t tile_i = i; tile_i < end_i; ++tile_i) { + for (index_t tile_j = j; tile_j < end_j; ++tile_j) { + memcpy(output + (tile_j * stride_i + tile_i) * channel, + input + (tile_i * stride_j + tile_j) * channel, + channel_raw_size); + } + } + } + } + } else { + std::vector + in_stride{input_shape[1] * input_shape[2] * input_shape[3], + input_shape[2] * input_shape[3], input_shape[3], 1}; + std::vector + out_stride{output_shape[1] * output_shape[2] * output_shape[3], + output_shape[2] * output_shape[3], output_shape[3], 1}; + + std::vector idim(4, 0); + std::vector odim(4, 0); + for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { + for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { + for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { + for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { + idim[dst_dims[0]] = odim[0]; + idim[dst_dims[1]] = odim[1]; + idim[dst_dims[2]] = odim[2]; + idim[dst_dims[3]] = odim[3]; + + output[odim[0] * out_stride[0] + odim[1] * out_stride[1] + + odim[2] * out_stride[2] + odim[3]] = + input[idim[0] * in_stride[0] + idim[1] * in_stride[1] + + idim[2] * in_stride[2] + idim[3]]; + } + } + } + } + } + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; +} template -class TransposeOp : public Operation { +class TransposeOp; + +template +class TransposeOp : public Operation { public: explicit TransposeOp(OpConstructContext *context) : Operation(context), @@ -121,106 +239,10 @@ class TransposeOp : public Operation { Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - - if (input->dim_size() == 2) { - MACE_CHECK(dims_[0] == 1 && dims_[1] == 0, "no 
need transform"); - index_t height = input_shape[0]; - index_t width = input_shape[1]; - index_t stride_i = height; - index_t stride_j = width; - index_t tile_size = height > 512 || width > 512 ? 64 : 32; -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - output_data[tile_j * stride_i + tile_i] = - input_data[tile_i * stride_j + tile_j]; - } - } - } - } - } else if (input->dim_size() == 4) { - std::vector transpose_order_from_NHWC_to_NCHW{0, 3, 1, 2}; - std::vector transpose_order_from_NCHW_to_NHWC{0, 2, 3, 1}; - index_t batch_size = input->dim(1) * input->dim(2) * input->dim(3); - - if (dims_ == transpose_order_from_NHWC_to_NCHW && input->dim(3) == 3) { - for (index_t b = 0; b < input->dim(0); ++b) { - TransposeNHWCToNCHWC3(input_data + b * batch_size, - output_data + b * batch_size, - input->dim(1), - input->dim(2)); - } - } else if (dims_ == transpose_order_from_NCHW_to_NHWC - && input->dim(1) == 2) { - for (index_t b = 0; b < input->dim(0); ++b) { - TransposeNCHWToNHWCC2(input_data + b * batch_size, - output_data + b * batch_size, - input->dim(2), - input->dim(3)); - } - } else if (dims_ == std::vector{0, 2, 1, 3}) { - index_t height = input_shape[1]; - index_t width = input_shape[2]; - index_t channel = input_shape[3]; - index_t channel_raw_size = channel * sizeof(T); - index_t stride_i = height; - index_t stride_j = width; - index_t tile_size = std::max(static_cast(1), - static_cast(std::sqrt( - 8 * 1024 / channel))); -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < height; i += tile_size) { - for (index_t j = 0; j < width; j += tile_size) { - index_t end_i = std::min(i + tile_size, height); - index_t end_j = std::min(j + tile_size, width); - for (index_t tile_i = i; tile_i < end_i; ++tile_i) { - for (index_t tile_j = j; tile_j < end_j; ++tile_j) { - memcpy(output_data + (tile_j * stride_i + tile_i) * channel, - input_data + (tile_i * stride_j + tile_j) * channel, - channel_raw_size); - } - } - } - } - } else { - std::vector - in_stride{input_shape[1] * input_shape[2] * input_shape[3], - input_shape[2] * input_shape[3], input_shape[3], 1}; - std::vector - out_stride{output_shape[1] * output_shape[2] * output_shape[3], - output_shape[2] * output_shape[3], output_shape[3], 1}; - - std::vector idim(4, 0); - std::vector odim(4, 0); - for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) { - for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) { - for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) { - for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) { - idim[dims_[0]] = odim[0]; - idim[dims_[1]] = odim[1]; - idim[dims_[2]] = odim[2]; - idim[dims_[3]] = odim[3]; - - output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1] - + odim[2] * out_stride[2] + odim[3]] = - input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1] - + idim[2] * in_stride[2] + idim[3]]; - } - } - } - } - } - } else { - MACE_NOT_IMPLEMENTED; - } + const float *input_data = input->data(); + float *output_data = output->mutable_data(); - return MaceStatus::MACE_SUCCESS; + return Transpose(input_data, input->shape(), dims_, output_data); } private: diff --git a/mace/ops/opencl/common.h b/mace/ops/transpose.h similarity index 66% rename from mace/ops/opencl/common.h rename to 
mace/ops/transpose.h index f0bf872eb84c4b4dd1705ec0b594b10d987b03a7..c4ab39dcaa5ed87877eda681febf82901dfa2b81 100644 --- a/mace/ops/opencl/common.h +++ b/mace/ops/transpose.h @@ -12,23 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_COMMON_H_ -#define MACE_OPS_OPENCL_COMMON_H_ +#ifndef MACE_OPS_TRANSPOSE_H_ +#define MACE_OPS_TRANSPOSE_H_ + +#include + +#include "mace/public/mace.h" namespace mace { namespace ops { -enum BufferType { - CONV2D_FILTER = 0, - IN_OUT_CHANNEL = 1, - ARGUMENT = 2, - IN_OUT_HEIGHT = 3, - IN_OUT_WIDTH = 4, - WINOGRAD_FILTER = 5, - DW_CONV2D_FILTER = 6, - WEIGHT_HEIGHT = 7, - WEIGHT_WIDTH = 8, -}; +MaceStatus Transpose(const float *input, + const std::vector &input_shape, + const std::vector &dst_dims, + float *output); + } // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_COMMON_H_ + +#endif // MACE_OPS_TRANSPOSE_H_ diff --git a/mace/ops/winograd_convolution_benchmark.cc b/mace/ops/winograd_convolution_benchmark.cc deleted file mode 100644 index 624851657e7b704e4eea46a213978541facd52dc..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_convolution_benchmark.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
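
With this change, mace/ops/transpose.cc exposes Transpose() as a free function declared in the new mace/ops/transpose.h, so the tiled 2-D and 4-D permutation paths can be reused outside TransposeOp. A minimal calling sketch follows; the rendered diff drops template arguments, so the vector element types (index_t for the shape, int for the permutation) are inferred from how TransposeOp forwards input->shape() and dims_:

#include <vector>

#include "mace/ops/transpose.h"

void TransposeExample() {
  // Permute a 1x2x3x4 NHWC tensor to NCHW layout ({0, 3, 1, 2}).
  std::vector<mace::index_t> in_shape{1, 2, 3, 4};  // index_t is MACE's index type
  std::vector<int> perm{0, 3, 1, 2};
  std::vector<float> src(1 * 2 * 3 * 4, 0.f);
  std::vector<float> dst(src.size());
  mace::MaceStatus status =
      mace::ops::Transpose(src.data(), in_shape, perm, dst.data());
  (void)status;  // MACE_SUCCESS on completion
}

Per the dispatch in the function body above, a {0, 3, 1, 2} permutation with three input channels takes the specialized TransposeNHWCToNCHWC3 path, and {0, 2, 3, 1} with two channels takes TransposeNCHWToNHWCC2; everything else falls back to the generic tiled or strided loops.
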
- -#include "mace/core/testing/test_benchmark.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -template -void BMWinogradConvolution( - int iters, int batch, int height, int width, - int in_channels, int out_channels, int block_size) { - mace::testing::StopTiming(); - OpsTestNet net; - net.AddRandomInput("Input", {batch, height, width, in_channels}); - - net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); - net.AddRandomInput("Bias", {out_channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT); - - // Winograd convolution - // transform filter - BufferToImage(&net, "Filter", "WinoFilter", - ops::BufferType::WINOGRAD_FILTER, block_size); - - // Inference convolution output shape - OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest") - .Input("InputImage") - .Output("ShapeOutput") - .AddIntArg("data_format", 0) - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("kernels", {static_cast(out_channels), - static_cast(in_channels), - 3, 3}) - .AddIntArg("padding", Padding::SAME) - .OutputType({DataTypeToEnum::v()}) - .Finalize(net.NewOperatorDef()); - - // Transform input - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("WinoInput") - .AddIntArg("padding", Padding::SAME) - .AddIntArg("wino_block_size", block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.AddNewOperatorDef()); - - // MatMul - OpDefBuilder("MatMul", "MatMulTest") - .Input("WinoFilter") - .Input("WinoInput") - .Output("WinoGemm") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.AddNewOperatorDef()); - - // Inverse transform - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("WinoGemm") - .Input("ShapeOutput") - .Input("BiasImage") - .AddIntArg("batch", batch) - .AddIntArg("height", height) - .AddIntArg("width", width) - .AddIntArg("wino_block_size", block_size) - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.AddNewOperatorDef()); - net.Setup(D); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \ - static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\ - TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * IC * H * W; \ - const int64_t macc = \ - static_cast(iters) * N * OC * H * W * (3 * 3 * IC + 1); \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMWinogradConvolution(iters, N, H, W, IC, OC, M); \ - } \ - MACE_BENCHMARK( \ - MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \ - MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU); - - -MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2); -MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2); -MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4); -MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4); -MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2); 
-MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4); -MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2); -MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2); -MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4); - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc deleted file mode 100644 index 556ee0ba8a3d20de45711b4b201682fcf662a9e6..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_convolution_test.cc +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -class WinogradConvolutionTest : public OpsTestBase {}; - -namespace { - -template -void WinogradConvolution(const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - const Padding padding, - const int block_size) { - // srand(time(NULL)); - - // Construct graph - OpsTestNet net; - // Add input data - net.AddRandomInput("Input", {batch, height, width, in_channels}); - net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); - net.AddRandomInput("Bias", {out_channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "ConvOutput", - ops::BufferType::IN_OUT_CHANNEL); - - auto expected = net.CreateTensor(); - expected->Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected->shape(); - - // Winograd convolution - // transform filter - BufferToImage(&net, "Filter", "WinoFilter", - ops::BufferType::WINOGRAD_FILTER, block_size); - // transform input - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("WinoInput") - .AddIntArg("padding", padding) - .AddIntArg("wino_block_size", block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - - OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest") - .Input("InputImage") - .Output("ShapeOutput") - .AddIntArg("data_format", 0) - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("kernels", {static_cast(out_channels), - static_cast(in_channels), - 3, 3}) - .AddIntArg("padding", padding) - .OutputType({DataTypeToEnum::v()}) - 
.Finalize(net.NewOperatorDef()); - net.RunOp(D); - - // MatMul - OpDefBuilder("MatMul", "MatMulTest") - .Input("WinoFilter") - .Input("WinoInput") - .Output("WinoGemm") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on opencl - net.RunOp(D); - - // Inverse transform - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("WinoGemm") - .Input("ShapeOutput") - .Input("BiasImage") - .AddIntArg("wino_block_size", block_size) - .Output("WinoOutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - net.Sync(); - - ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-2, 1e-2); - } else { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-5, 1e-4); - } -} -} // namespace - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM2) { - WinogradConvolution(1, 32, 32, 3, 3, - Padding::VALID, 2); - WinogradConvolution(1, 32, 32, 3, 3, - Padding::SAME, 2); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2) { - WinogradConvolution(1, 61, 67, 31, 37, - Padding::VALID, 2); - WinogradConvolution(1, 61, 67, 37, 31, - Padding::SAME, 2); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionM2) { - WinogradConvolution(3, 64, 64, 32, 32, - Padding::VALID, 2); - WinogradConvolution(5, 61, 67, 37, 31, - Padding::SAME, 2); -} - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM4) { - WinogradConvolution(1, 32, 32, 3, 3, - Padding::VALID, 4); - WinogradConvolution(1, 32, 32, 3, 3, - Padding::SAME, 4); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) { - WinogradConvolution(1, 61, 67, 31, 37, - Padding::VALID, 4); - WinogradConvolution(1, 61, 67, 37, 31, - Padding::SAME, 4); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionM4) { - WinogradConvolution(3, 107, 113, 5, 7, - Padding::VALID, 4); - WinogradConvolution(5, 107, 113, 5, 7, - Padding::SAME, 4); -} - -namespace { -template -void WinogradConvolutionWithPad(const index_t batch, - const index_t height, - const index_t width, - const index_t in_channels, - const index_t out_channels, - const int padding, - const int block_size) { - // srand(time(NULL)); - - // Construct graph - OpsTestNet net; - // Add input data - net.AddRandomInput("Input", {batch, height, width, in_channels}); - net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); - net.AddRandomInput("Bias", {out_channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - BufferToImage(&net, "Filter", "FilterImage", - ops::BufferType::CONV2D_FILTER); - BufferToImage(&net, "Bias", "BiasImage", ops::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - net.RunOp(D); - - // Transfer output - ImageToBuffer(&net, "OutputImage", "ConvOutput", - ops::BufferType::IN_OUT_CHANNEL); - auto expected = net.CreateTensor(); - expected->Copy(*net.GetOutput("ConvOutput")); - auto output_shape = expected->shape(); - - // Winograd convolution - // transform filter - BufferToImage(&net, "Filter", "WinoFilter", - 
ops::BufferType::WINOGRAD_FILTER, block_size); - // transform input - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("WinoInput") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntsArg("padding_values", {padding, padding}) - .AddIntArg("wino_block_size", block_size) - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - - OpDefBuilder("InferConv2dShape", "InferConv2dShapeTest") - .Input("InputImage") - .Output("ShapeOutput") - .AddIntArg("data_format", 0) - .AddIntsArg("strides", {1, 1}) - .AddIntsArg("kernels", {static_cast(out_channels), - static_cast(in_channels), - 3, 3}) - .AddIntsArg("padding_values", {padding, padding}) - .OutputType({DataTypeToEnum::v()}) - .Finalize(net.NewOperatorDef()); - net.RunOp(D); - - // MatMul - OpDefBuilder("MatMul", "MatMulTest") - .Input("WinoFilter") - .Input("WinoInput") - .Output("WinoGemm") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - // Run on opencl - net.RunOp(D); - - // Inverse transform - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("WinoGemm") - .Input("ShapeOutput") - .Input("BiasImage") - .AddIntArg("wino_block_size", block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Output("WinoOutputImage") - .Finalize(net.NewOperatorDef()); - - // Run on opencl - net.RunOp(D); - net.Sync(); - - ImageToBuffer(&net, "WinoOutputImage", "WinoOutput", - ops::BufferType::IN_OUT_CHANNEL); - if (DataTypeToEnum::value == DataType::DT_HALF) { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-2, 1e-2); - } else { - ExpectTensorNear(*expected, *net.GetOutput("WinoOutput"), - 1e-5, 1e-4); - } -} -} // namespace - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM2WithPad) { - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 1, 2); - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 2, 2); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2WithPad) { - WinogradConvolutionWithPad(1, 61, 67, 31, 37, - 1, 2); - WinogradConvolutionWithPad(1, 61, 67, 37, 31, - 2, 2); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionWithM2Pad) { - WinogradConvolutionWithPad(3, 64, 64, 32, 32, - 1, 2); - WinogradConvolutionWithPad(5, 61, 67, 37, 31, - 2, 2); -} - -TEST_F(WinogradConvolutionTest, AlignedConvolutionM4WithPad) { - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 1, 4); - WinogradConvolutionWithPad(1, 32, 32, 32, 16, - 2, 4); -} - -TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4WithPad) { - WinogradConvolutionWithPad(1, 61, 67, 31, 37, - 1, 4); - WinogradConvolutionWithPad(1, 61, 67, 37, 31, - 2, 4); -} - -TEST_F(WinogradConvolutionTest, BatchConvolutionWithM4Pad) { - WinogradConvolutionWithPad(3, 64, 64, 32, 32, - 1, 4); - WinogradConvolutionWithPad(5, 61, 67, 37, 31, - 2, 4); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc deleted file mode 100644 index b2635f4de1d9f622d99808af2d1dc7fcb69c720b..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_transform.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "mace/core/operator.h" -#include "mace/ops/activation.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/opencl/image/winograd_transform.h" - -namespace mace { -namespace ops { - -template -class WinogradTransformOp; - -template -class WinogradTransformOp : public Operation { - public: - explicit WinogradTransformOp(OpConstructContext *context) - : Operation(context) { - Padding padding_type = static_cast(Operation::GetOptionalArg( - "padding", static_cast(VALID))); - std::vector paddings = Operation::GetRepeatedArgs( - "padding_values"); - int block_size = Operation::GetOptionalArg("wino_block_size", 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradTransformKernel( - padding_type, paddings, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } - } - - MaceStatus Run(OpContext *context) override { - const Tensor *input_tensor = this->Input(0); - Tensor *output_tensor = this->Output(0); - return kernel_->Compute(context, input_tensor, output_tensor); - } - - private: - std::unique_ptr kernel_; -}; - -template -class WinogradInverseTransformOp; - -template -class WinogradInverseTransformOp : public Operation { - public: - explicit WinogradInverseTransformOp(OpConstructContext *context) - : Operation(context) { - ActivationType activation = ops::StringToActivationType( - Operation::GetOptionalArg("activation", "NOOP")); - float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); - int block_size = Operation::GetOptionalArg("wino_block_size", 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradInverseTransformKernel( - activation, relux_max_limit, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } - } - - MaceStatus Run(OpContext *context) override { - Tensor *output_tensor = this->Output(0); - return kernel_->Compute(context, inputs_, output_tensor); - } - - private: - std::unique_ptr kernel_; -}; - -void RegisterWinogradTransform(OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "WinogradTransform", - WinogradTransformOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "WinogradTransform", - WinogradTransformOp, DeviceType::GPU, half); -} - -void RegisterWinogradInverseTransform( - OpRegistryBase *op_registry) { - MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", - WinogradInverseTransformOp, DeviceType::GPU, float); - MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", - WinogradInverseTransformOp, DeviceType::GPU, half); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc deleted file mode 100644 index bb6679bbe12d147fa3842369cb81071f746f970d..0000000000000000000000000000000000000000 --- a/mace/ops/winograd_transform_benchmark.cc +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/testing/test_benchmark.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -namespace { -template -void BMWinogradTransform( - int iters, int batch, int height, int width, int channels, int block_size) { - mace::testing::StopTiming(); - - OpsTestNet net; - net.AddRandomInput("Input", {batch, height, width, channels}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_CHANNEL); - OpDefBuilder("WinogradTransform", "WinogradTransformTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("block_size", block_size) - .Finalize(net.NewOperatorDef()); - - net.Setup(D); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \ - static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\ - DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMWinogradTransform(iters, N, H, W, C, M); \ - } \ - MACE_BENCHMARK( \ - MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \ - MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU); - -MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2); -MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2); -MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2); -MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4); -MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4); -MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4); - -namespace { -template -void BMWinogradInverseTransform( - int iters, int batch, int height, int width, int channels, int block_size) { - mace::testing::StopTiming(); - - index_t p = batch * ((height + block_size - 1) / block_size) * - ((width + block_size - 1) / block_size); - OpsTestNet net; - net.AddRandomInput("Input", {(block_size + 2) * - (block_size + 2), channels, p, 1}); - - BufferToImage(&net, "Input", "InputImage", - ops::BufferType::IN_OUT_HEIGHT); - OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") - .Input("InputImage") - .AddIntArg("batch", batch) - .AddIntArg("height", height) - .AddIntArg("width", width) - .AddIntArg("block_size", block_size) - .Output("OutputImage") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - net.Setup(D); - - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \ - static void \ - MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\ - ##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - 
BMWinogradInverseTransform(iters, N, H, W, C, M); \ - } \ - MACE_BENCHMARK( \ - MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\ - DEVICE) - -#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \ - MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU); - -MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2); - -MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4); -MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4); - -namespace { -template -void WinoFilterBufferToImage(int iters, - int out_channel, int in_channel, - int height, int width, int wino_block_size) { - mace::testing::StopTiming(); - - OpsTestNet net; - - // Add input data - net.AddRandomInput("Input", - {out_channel, in_channel, height, width}); - - OpDefBuilder("BufferToImage", "BufferToImageTest") - .Input("Input") - .Output("Output") - .AddIntArg("buffer_type", ops::BufferType::WINOGRAD_FILTER) - .AddIntArg("wino_block_size", wino_block_size) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - - // Warm-up - net.Setup(D); - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \ - static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\ - int iters) { \ - const int64_t tot = static_cast(iters) * O * I * H * W; \ - mace::testing::MaccProcessed(tot); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - WinoFilterBufferToImage(iters, O, I, H, W, M); \ - } \ - MACE_BENCHMARK(\ - MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINO_B2I(O, I, H, W, M) \ - MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU); - -MACE_BM_WINO_B2I(16, 3, 3, 3, 2); -MACE_BM_WINO_B2I(16, 3, 3, 3, 4); -MACE_BM_WINO_B2I(32, 3, 3, 3, 2); -MACE_BM_WINO_B2I(32, 3, 3, 3, 4); -MACE_BM_WINO_B2I(128, 3, 3, 3, 2); -MACE_BM_WINO_B2I(128, 3, 3, 3, 4); -MACE_BM_WINO_B2I(256, 3, 3, 3, 2); -MACE_BM_WINO_B2I(256, 3, 3, 3, 4); - -namespace { -template -void WinoMatMulBenchmark( - int iters, int out_channels, int in_channels, - int height, int width, int block_size) { - mace::testing::StopTiming(); - - OpsTestNet net; - const int batch = (block_size + 2) * (block_size + 2); - const index_t round_h = (height + block_size - 1) / block_size; - const index_t round_w = (width + block_size - 1) / block_size; - const index_t out_width = round_h * round_w; - // Add input data - net.AddRandomInput("A", {batch, out_channels, in_channels}); - net.AddRandomInput("B", {batch, in_channels, out_width}); - - if (D == DeviceType::GPU) { - BufferToImage(&net, "A", "AImage", ops::BufferType::IN_OUT_WIDTH); - BufferToImage(&net, "B", "BImage", - ops::BufferType::IN_OUT_HEIGHT); - - OpDefBuilder("MatMul", "MatMulBM") - .Input("AImage") - .Input("BImage") - .Output("Output") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); - } else { - OpDefBuilder("MatMul", "MatMulBM") - .Input("A") - .Input("B") - .Output("Output") - .Finalize(net.NewOperatorDef()); - } - net.Setup(D); - // Warm-up - for (int i = 0; i < 5; ++i) { - net.Run(); - } - net.Sync(); - - mace::testing::StartTiming(); - while (iters--) { - net.Run(); - } - net.Sync(); -} -} // namespace - -#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, 
M, TYPE, DEVICE) \ - static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\ - DEVICE(int iters) { \ - const int64_t macc = static_cast(iters) * OC * IC * H * W; \ - const int64_t tot = static_cast(iters) * OC * (IC * H + H * W); \ - mace::testing::MaccProcessed(macc); \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - WinoMatMulBenchmark(iters, OC, IC, H, W, M); \ - } \ - MACE_BENCHMARK(\ - MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE) - -#define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \ - MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU); - -MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2); -MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4); -MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2); -MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4); - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index a3064df0f2a945c0b960e7c6b55bff103c71519a..530de3aedfcd6a94d9ee840f8e368a4447d6cd8c 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -7,11 +7,6 @@ option optimize_for = LITE_RUNTIME; // For better compatibility, // the mace.proto is refered from tensorflow and caffe2. -enum NetMode { - INIT = 0; - NORMAL = 1; -} - enum DataType { DT_INVALID = 0; DT_FLOAT = 1; @@ -90,18 +85,6 @@ message OperatorDef { repeated int32 out_max_byte_size = 104; // only support 32-bit len } -// for memory optimization -message MemoryBlock { - optional int32 mem_id = 1; - optional int32 device_type = 2; - optional MemoryType mem_type = 3; - optional uint32 x = 4; - optional uint32 y = 5; -} -message MemoryArena { - repeated MemoryBlock mem_block = 1; -} - // for hexagon mace-nnlib message InputInfo { optional string name = 1; @@ -109,6 +92,7 @@ message InputInfo { repeated int32 dims = 3; optional int32 max_byte_size = 4; // only support 32-bit len optional DataType data_type = 5 [default = DT_FLOAT]; + optional int32 data_format = 6 [default = 1]; // NHWC } message OutputInfo { optional string name = 1; @@ -116,6 +100,7 @@ message OutputInfo { repeated int32 dims = 3; optional int32 max_byte_size = 4; // only support 32-bit len optional DataType data_type = 5 [default = DT_FLOAT]; + optional int32 data_format = 6 [default = 1]; // NHWC } message NetDef { @@ -123,9 +108,6 @@ message NetDef { repeated Argument arg = 2; repeated ConstTensor tensors = 3; - // for mem optimization - optional MemoryArena mem_arena = 10; - // for hexagon mace-nnlib repeated InputInfo input_info = 100; repeated OutputInfo output_info = 101; diff --git a/mace/public/mace.h b/mace/public/mace.h index 9e7f568638cc71a9cf358f141b4c0ed46853ab34..01818ef5719b48298bd501967bb91cb99521336f 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -34,6 +34,8 @@ class NetDef; enum DeviceType { CPU = 0, GPU = 2, HEXAGON = 3 }; +enum DataFormat { DF_NONE = 0, NHWC = 1, NCHW = 2}; + enum GPUPerfHint { PERF_DEFAULT = 0, PERF_LOW = 1, @@ -259,7 +261,8 @@ class MACE_API MaceTensor { // data - the buffer of the tensor, must not be null with size equals // shape[0] * shape[1] * ... 
* shape[n-1] MaceTensor(const std::vector &shape, - std::shared_ptr data); + std::shared_ptr data, + const DataFormat format = DataFormat::NHWC); MaceTensor(); MaceTensor(const MaceTensor &other); MaceTensor(const MaceTensor &&other); @@ -270,6 +273,7 @@ class MACE_API MaceTensor { const std::vector &shape() const; const std::shared_ptr data() const; std::shared_ptr data(); + DataFormat data_format() const; private: class Impl; diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD index 693ed9dea2fecce8df3cdc246fa6d4ff87b47024..41f039476ee7f6b50a15ac8cac1dc30dc7738121 100644 --- a/mace/python/tools/BUILD +++ b/mace/python/tools/BUILD @@ -21,7 +21,6 @@ py_library( ], srcs_version = "PY2AND3", deps = [ - ":memory_optimizer", ":quantization_lib", "//mace/proto:mace_py", "//third_party/caffe:caffe_py", @@ -39,15 +38,6 @@ py_library( ], ) -py_binary( - name = "memory_optimizer", - srcs = ["memory_optimizer.py"], - srcs_version = "PY2AND3", - deps = [ - "//mace/proto:mace_py", - ], -) - py_binary( name = "converter", srcs = ["converter.py"], diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index a89e3abdb1e4a75fdf3ee5489439cb7d89cbfcfd..790654f49472a7159740db72a2ec6c6ff6c9be52 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -21,7 +21,6 @@ import copy import six from mace.proto import mace_pb2 -from mace.python.tools import memory_optimizer from mace.python.tools import model_saver from mace.python.tools.converter_tool import base_converter as cvt from mace.python.tools.converter_tool import transformer @@ -36,7 +35,13 @@ FLAGS = None device_type_map = {'cpu': cvt.DeviceType.CPU.value, 'gpu': cvt.DeviceType.GPU.value, - 'dsp': cvt.DeviceType.HEXAGON.value} + 'dsp': cvt.DeviceType.HEXAGON.value, + 'cpu+gpu': cvt.DeviceType.CPU.value} + +data_format_map = { + 'NONE': cvt.DataFormat.DF_NONE, + 'NHWC': cvt.DataFormat.NHWC, +} def parse_data_type(data_type, device_type): @@ -117,6 +122,7 @@ def main(unused_args): input_node_names = FLAGS.input_node.split(',') input_node_shapes = FLAGS.input_shape.split(':') + input_node_formats = FLAGS.input_data_formats.split(",") if FLAGS.input_range: input_node_ranges = FLAGS.input_range.split(':') else: @@ -126,6 +132,10 @@ def main(unused_args): for i in six.moves.range(len(input_node_names)): input_node = cvt.NodeInfo() input_node.name = input_node_names[i] + if len(input_node_formats) == 1: + input_node.data_format = data_format_map[input_node_formats[0]] + else: + input_node.data_format = data_format_map[input_node_formats[i]] input_node.shape = parse_int_array_from_str(input_node_shapes[i]) if len(input_node_ranges) > i: input_node.range = parse_float_array_from_str(input_node_ranges[i]) @@ -133,11 +143,16 @@ def main(unused_args): output_node_names = FLAGS.output_node.split(',') output_node_shapes = FLAGS.output_shape.split(':') + output_node_formats = FLAGS.output_data_formats.split(",") if len(output_node_names) != len(output_node_shapes): raise Exception('output node count and shape count do not match.') for i in six.moves.range(len(output_node_names)): output_node = cvt.NodeInfo() output_node.name = output_node_names[i] + if len(output_node_formats) == 1: + output_node.data_format = data_format_map[output_node_formats[0]] + else: + output_node.data_format = data_format_map[output_node_formats[i]] output_node.shape = parse_int_array_from_str(output_node_shapes[i]) option.add_output_node(output_node) @@ -179,74 +194,25 @@ def main(unused_args): output_graph_def = converter.run() - if 
FLAGS.runtime == 'cpu+gpu': - cpu_graph_def = copy.deepcopy(output_graph_def) - - option.device = cvt.DeviceType.GPU.value - option.data_type = parse_data_type( - FLAGS.data_type, cvt.DeviceType.GPU.value) - mace_gpu_transformer = transformer.Transformer( - option, output_graph_def) - output_graph_def, _ = mace_gpu_transformer.run() - six.print_("start optimize gpu memory.") - memory_optimizer.optimize_gpu_memory(output_graph_def) - six.print_("GPU memory optimization done.") - - option.device = cvt.DeviceType.CPU.value - option.data_type = parse_data_type( - FLAGS.data_type, cvt.DeviceType.CPU.value) - option.disable_transpose_filters() - mace_cpu_transformer = transformer.Transformer( - option, cpu_graph_def) - cpu_graph_def, _ = mace_cpu_transformer.run() - print("start optimize cpu memory.") - memory_optimizer.optimize_cpu_memory(cpu_graph_def) - print("CPU memory optimization done.") - - print("Merge cpu and gpu ops together") - output_graph_def.op.extend(cpu_graph_def.op) - output_graph_def.mem_arena.mem_block.extend( - cpu_graph_def.mem_arena.mem_block) - output_graph_arg_names = set() - for arg in output_graph_def.arg: - output_graph_arg_names.add(arg.name) - - for arg in cpu_graph_def.arg: - if arg.name not in output_graph_arg_names: - output_graph_def.arg.extend(arg) - print("Merge done") - else: - option.device = device_type_map[FLAGS.runtime] - option.data_type = parse_data_type( - FLAGS.data_type, option.device) - mace_transformer = transformer.Transformer( - option, output_graph_def) - output_graph_def, quantize_activation_info = mace_transformer.run() - - if FLAGS.runtime == 'dsp': - from mace.python.tools.converter_tool import hexagon_converter - converter = hexagon_converter.HexagonConverter( - option, output_graph_def, quantize_activation_info) - output_graph_def = converter.run() - - print("start optimize memory.") - if FLAGS.runtime == 'gpu': - memory_optimizer.optimize_gpu_memory(output_graph_def) - elif FLAGS.runtime == 'cpu': - memory_optimizer.optimize_cpu_memory(output_graph_def) - elif FLAGS.runtime == 'dsp': - pass - else: - mace_check(False, "runtime only support [gpu|cpu|dsp]") - - print("Memory optimization done.") + option.device = device_type_map[FLAGS.runtime] + option.data_type = parse_data_type( + FLAGS.data_type, option.device) + mace_transformer = transformer.Transformer( + option, output_graph_def) + output_graph_def, quantize_activation_info = mace_transformer.run() + + if FLAGS.runtime == 'dsp': + from mace.python.tools.converter_tool import hexagon_converter + converter = hexagon_converter.HexagonConverter( + option, output_graph_def, quantize_activation_info) + output_graph_def = converter.run() model_saver.save_model( - output_graph_def, model_checksum, weight_checksum, + option, output_graph_def, model_checksum, weight_checksum, FLAGS.template_dir, FLAGS.obfuscate, FLAGS.model_tag, - FLAGS.output_dir, FLAGS.runtime, + FLAGS.output_dir, FLAGS.embed_model_data, - FLAGS.winograd, FLAGS.data_type, + FLAGS.winograd, FLAGS.model_graph_format) @@ -293,8 +259,18 @@ def parse_args(): type=str, default="input_node", help="e.g., input_node") + parser.add_argument( + "--input_data_formats", + type=str, + default="NHWC", + help="e.g., NHWC,NONE") parser.add_argument( "--output_node", type=str, default="softmax", help="e.g., softmax") + parser.add_argument( + "--output_data_formats", + type=str, + default="NHWC", + help="e.g., NHWC,NONE") parser.add_argument( "--check_node", type=str, default="softmax", help="e.g., softmax") parser.add_argument( diff --git 
a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 6b5d227eb5e5e6967b7510602e7d03fd9ef033c4..3f8d7164b64ec4d64253f8562b54e9e7b31f377d 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -25,15 +25,16 @@ class DeviceType(Enum): class DataFormat(Enum): - NHWC = 0 - NCHW = 1 + DF_NONE = 0 + NHWC = 1 + NCHW = 2 class FilterFormat(Enum): - HWIO = 0 - OIHW = 1 - HWOI = 2 - OHWI = 3 + HWIO = 100 + OIHW = 101 + HWOI = 102 + OHWI = 103 class PaddingMode(Enum): @@ -113,7 +114,6 @@ MaceSupportedOps = [ 'ResizeBilinear', 'Reverse', 'ScalarMath', - 'Slice', 'Split', 'Shape', 'Squeeze', @@ -137,9 +137,6 @@ class MaceKeyword(object): mace_input_node_name = 'mace_input_node' mace_output_node_name = 'mace_output_node' mace_buffer_type = 'buffer_type' - mace_mode = 'mode' - mace_buffer_transform = 'BufferTransform' - mace_buffer_inverse_transform = 'BufferInverseTransform' # arg related str mace_padding_str = 'padding' mace_padding_values_str = 'padding_values' @@ -185,6 +182,8 @@ class MaceKeyword(object): mace_opencl_mem_type = "opencl_mem_type" mace_framework_type_str = "framework_type" mace_group_str = "group" + mace_wino_arg_str = "wino_block_size" + mace_quantize_flag_arg_str = "quantize_flag" class TransformerRule(Enum): @@ -195,7 +194,7 @@ class TransformerRule(Enum): FOLD_BATCHNORM = 5 FOLD_CONV_AND_BN = 6 FOLD_DEPTHWISE_CONV_AND_BN = 7 - TRANSFORM_GPU_WINOGRAD = 8 + ADD_WINOGRAD_ARG = 8 TRANSFORM_ADD_TO_BIASADD = 9 FOLD_BIASADD = 10 FLATTEN_ATROUS_CONV = 11 @@ -238,6 +237,7 @@ class NodeInfo(object): def __init__(self): self._name = None self._shape = [] + self._data_format = DataFormat.NHWC self._range = [-1.0, 1.0] @property @@ -248,6 +248,10 @@ class NodeInfo(object): def shape(self): return self._shape + @property + def data_format(self): + return self._data_format + @property def range(self): return self._range @@ -260,6 +264,10 @@ class NodeInfo(object): def shape(self, shape): self._shape = shape + @data_format.setter + def data_format(self, data_format): + self._data_format = data_format + @range.setter def range(self, range): self._range = range @@ -410,7 +418,6 @@ class ConverterOption(object): TransformerRule.FOLD_CONV_AND_BN, TransformerRule.FOLD_DECONV_AND_BN, TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN, - TransformerRule.TRANSFORM_GPU_WINOGRAD, TransformerRule.TRANSFORM_ADD_TO_BIASADD, TransformerRule.REARRANGE_BATCH_TO_SPACE, TransformerRule.FOLD_BIASADD, @@ -422,16 +429,14 @@ class ConverterOption(object): # Model data format related transformation TransformerRule.TRANSPOSE_FILTERS, TransformerRule.TRANSPOSE_DATA_FORMAT, + # Add winograd argument + TransformerRule.ADD_WINOGRAD_ARG, # Mace model structure related transformation TransformerRule.ADD_IN_OUT_TENSOR_INFO, - # Device related transformation - TransformerRule.ADD_BUFFER_TRANSFORM, - TransformerRule.ADD_DEVICE, # Data type related transformation TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE, # Transform finalization TransformerRule.ADD_OPENCL_INFORMATIONS, - TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES, # for quantization entropy calibration use TransformerRule.SORT_BY_EXECUTION, # Need to be put after SORT_BY_EXECUTION diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 7769b81e5f83e38add2289d564fdbb39c47b929b..d736719355d80df993f08c395f3495fb592cc993 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ 
b/mace/python/tools/converter_tool/transformer.py @@ -13,7 +13,6 @@ # limitations under the License. -import enum import re import numpy as np @@ -21,7 +20,6 @@ import six from mace.proto import mace_pb2 from mace.python.tools.converter_tool import base_converter -from mace.python.tools.converter_tool.base_converter import ActivationType from mace.python.tools.converter_tool.base_converter import ConverterUtil from mace.python.tools.converter_tool.base_converter import DataFormat from mace.python.tools.converter_tool.base_converter import DeviceType @@ -32,13 +30,9 @@ from mace.python.tools.converter_tool.base_converter import MaceKeyword from mace.python.tools.converter_tool.base_converter import MaceOp from mace.python.tools.converter_tool.base_converter import PaddingMode from mace.python.tools.converter_tool.base_converter import TransformerRule -from mace.python.tools.convert_util import calculate_image_shape from mace.python.tools.convert_util import mace_check -from mace.python.tools.convert_util import OpenCLBufferType from mace.python.tools.quantization import quantize_util -OPENCL_IMAGE_MAX_SIZE = 16384 - class Transformer(base_converter.ConverterInterface): """A class for transform naive mace model to optimized model. @@ -69,8 +63,6 @@ class Transformer(base_converter.ConverterInterface): self.fold_deconv_and_bn, # data_format related TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN: self.fold_depthwise_conv_and_bn, # data_format related - TransformerRule.TRANSFORM_GPU_WINOGRAD: - self.transform_gpu_winograd, # data_format related TransformerRule.TRANSFORM_ADD_TO_BIASADD: self.transform_add_to_biasadd, TransformerRule.REARRANGE_BATCH_TO_SPACE: @@ -84,25 +76,20 @@ class Transformer(base_converter.ConverterInterface): TransformerRule.TRANSPOSE_MATMUL_WEIGHT: self.transpose_matmul_weight, TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format, + TransformerRule.ADD_WINOGRAD_ARG: self.add_winograd_arg, TransformerRule.ADD_IN_OUT_TENSOR_INFO: self.add_in_out_tensor_info, TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC: self.transform_global_conv_to_fc, TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight, - TransformerRule.ADD_BUFFER_TRANSFORM: - self.add_buffer_transform, TransformerRule.QUANTIZE_NODES: self.quantize_nodes, TransformerRule.ADD_QUANTIZE_TENSOR_RANGE: self.add_quantize_tensor_range, TransformerRule.QUANTIZE_WEIGHTS: self.quantize_weights, - TransformerRule.ADD_DEVICE: - self.add_device, TransformerRule.UPDATE_FLOAT_OP_DATA_TYPE: self.update_float_op_data_type, - TransformerRule.ADD_MACE_INPUT_AND_OUTPUT_NODES: - self.add_mace_input_and_output_nodes, TransformerRule.ADD_OPENCL_INFORMATIONS: self.add_opencl_informations, TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution, @@ -112,28 +99,22 @@ class Transformer(base_converter.ConverterInterface): self._option = option self._model = model - self._gpu_wino_blk = self._option.winograd + self._wino_arg = self._option.winograd self._ops = {} self._consts = {} self._consumers = {} self._producer = {} self._target_data_format = DataFormat.NHWC - self._input_output_added = False - self._opencl_max_image_size = [0, 0] self._output_op_names = set() self._quantize_activation_info = {} self._quantized_tensor = set() - if self._option.device == DeviceType.CPU.value and \ - not self._option.quantize: - self._target_data_format = DataFormat.NCHW - def run(self): for key in self._option.transformer_option: transformer = self._registered_transformers[key] while True: - self.construct_ops_and_consumers() + 
self.construct_ops_and_consumers(key) changed = transformer() if not changed: break @@ -162,7 +143,7 @@ class Transformer(base_converter.ConverterInterface): MaceKeyword.mace_filter_format_str) arg.i = filter_format.value - def construct_ops_and_consumers(self): + def construct_ops_and_consumers(self, key): self._ops.clear() self._consumers.clear() self._producer.clear() @@ -178,27 +159,28 @@ class Transformer(base_converter.ConverterInterface): for output_tensor in op.output: self._producer[output_tensor] = op - for input_node in self._option.input_nodes.values(): - input_node_existed = False - for op in self._model.op: - if input_node.name in op.output: - input_node_existed = True - break - if not input_node_existed: - op = mace_pb2.OperatorDef() - op.name = self.normalize_op_name(input_node.name) - op.type = 'Input' - op.output.extend([input_node.name]) - output_shape = op.output_shape.add() - output_shape.dims.extend(input_node.shape) - if ConverterUtil.data_format( - self._consumers[input_node.name][0]) \ - == DataFormat.NCHW: - self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) - else: - ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) - self._producer[op.output[0]] = op + if key != TransformerRule.SORT_BY_EXECUTION: + for input_node in self._option.input_nodes.values(): + input_node_existed = False + for op in self._model.op: + if input_node.name in op.output: + input_node_existed = True + break + if not input_node_existed: + op = mace_pb2.OperatorDef() + op.name = self.normalize_op_name(input_node.name) + op.type = "Input" + op.output.extend([input_node.name]) + output_shape = op.output_shape.add() + output_shape.dims.extend(input_node.shape) + if ConverterUtil.data_format( + self._consumers[input_node.name][0]) \ + == DataFormat.NCHW: + self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) + ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + else: + ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) + self._producer[op.output[0]] = op @staticmethod def replace(obj_list, source, target): @@ -288,21 +270,17 @@ class Transformer(base_converter.ConverterInterface): for input_node in self._option.input_nodes.values(): input_info = net.input_info.add() input_info.name = input_node.name + input_info.data_format = input_node.data_format.value input_info.dims.extend(input_node.shape) - if self._option.quantize: - input_info.data_type = mace_pb2.DT_FLOAT - else: - input_info.data_type = self._option.data_type + input_info.data_type = mace_pb2.DT_FLOAT for output_node in self._option.output_nodes.values(): output_info = net.output_info.add() output_info.name = output_node.name + output_info.data_format = output_node.data_format.value output_info.dims.extend( self._producer[output_node.name].output_shape[0].dims) - if self._option.quantize: - output_info.data_type = mace_pb2.DT_FLOAT - else: - output_info.data_type = self._option.data_type + output_info.data_type = mace_pb2.DT_FLOAT return False @@ -725,173 +703,6 @@ class Transformer(base_converter.ConverterInterface): mace_check(False, "filter format %s not supported" % filter_format) return filter_height, filter_width, in_channels, out_channels - def check_if_gpu_use_winograd_conv(self, op): - if not self._option.winograd: - return False - if op.type != MaceOp.Conv2D.name: - return False - - filter_shape = self._consts[op.input[1]].dims - output_shape = op.output_shape[0].dims - strides = ConverterUtil.get_arg(op, MaceKeyword.mace_strides_str).ints - dilations_arg 
= ConverterUtil.get_arg(op, - MaceKeyword.mace_dilations_str) - if dilations_arg is None: - dilations = [1, 1] - else: - dilations = dilations_arg.ints - filter_height, filter_width, in_channels, out_channels = \ - Transformer.sort_filter_shape(filter_shape, self.filter_format()) - batch, out_height, out_width, _ = Transformer.sort_feature_map_shape( - output_shape, ConverterUtil.data_format(op)) - - if filter_height != 3 or filter_width != 3 or strides[0] > 1 \ - or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1: - return False - self._gpu_wino_blk = self._option.winograd - block_size = self._option.winograd - blk_sqr = (block_size + 2) * (block_size + 2) - width =\ - batch * ((out_height + block_size - 1) / block_size) *\ - ((out_width + block_size - 1) / block_size) - if blk_sqr * in_channels >= OPENCL_IMAGE_MAX_SIZE \ - or blk_sqr * out_channels >= OPENCL_IMAGE_MAX_SIZE \ - or width >= OPENCL_IMAGE_MAX_SIZE: - self._gpu_wino_blk = 2 - block_size = self._gpu_wino_blk - blk_sqr = (block_size + 2) * (block_size + 2) - width = \ - batch * ((out_height + block_size - 1) / block_size) * \ - ((out_width + block_size - 1) / block_size) - return (blk_sqr * in_channels < OPENCL_IMAGE_MAX_SIZE) and \ - (blk_sqr * out_channels < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) - - def transform_gpu_winograd(self): - """Only gpu needs winograd transform.""" - net = self._model - filter_format = self.filter_format() - if self._option.device == DeviceType.GPU.value: - for op in net.op: - if op.type == MaceOp.Conv2D.name \ - and self.check_if_gpu_use_winograd_conv(op): - print("Transform gpu winograd %s(%s)" % (op.name, op.type)) - block_size = self._gpu_wino_blk - blk_sqr = (block_size + 2) * (block_size + 2) - output_shape = op.output_shape[0].dims - filter = self._consts[op.input[1]] - filter_shape = filter.dims - data_format = ConverterUtil.data_format(op) - filter_height, filter_width, in_channels, out_channels = \ - Transformer.sort_filter_shape(filter_shape, - filter_format) - batch, out_height, out_width, _ = \ - Transformer.sort_feature_map_shape(output_shape, - data_format) - - # Input transform - wt_op = net.op.add() - wt_op.name = op.name + '_input_transform' - wt_op.type = MaceOp.WinogradTransform.name - wt_op.input.extend([op.input[0]]) - wt_op.output.extend([wt_op.name]) - wt_output_shape = wt_op.output_shape.add() - wt_output_width =\ - batch * ((out_height + block_size - 1) / block_size) *\ - ((out_width + block_size - 1) / block_size) - wt_output_shape.dims.extend( - [blk_sqr, in_channels, wt_output_width]) - - blk_size_arg = wt_op.arg.add() - blk_size_arg.name = MaceKeyword.mace_wino_block_size - blk_size_arg.i = block_size - - if ConverterUtil.get_arg(op, - MaceKeyword.mace_padding_str) \ - is not None: - padding_arg = wt_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_str - padding_arg.i = ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_str).i - elif ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str) \ - is not None: - padding_arg = wt_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_values_str - padding_arg.ints.extend(ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str).ints) - - # MatMul - matmul_op = net.op.add() - matmul_op.name = op.name + '_matmul' - matmul_op.type = MaceOp.MatMul.name - matmul_op.input.extend([op.input[1], wt_op.output[0]]) - matmul_op.output.extend([matmul_op.name]) - matmul_output_shape = matmul_op.output_shape.add() - matmul_output_shape.dims.extend( - [blk_sqr, out_channels, 
wt_output_width]) - - arg = matmul_op.arg.add() - arg.name = MaceKeyword.mace_winograd_filter_transformed - arg.i = 1 - - shape_op = net.op.add() - shape_op.name = op.name + '_infer_shape' - shape_op.type = MaceOp.InferConv2dShape.name - shape_op.input.extend([op.input[0]]) - shape_op.output.extend([shape_op.name]) - shape_output_shape = shape_op.output_shape.add() - shape_output_shape.dims.extend([4]) - - kernels_arg = shape_op.arg.add() - kernels_arg.name = MaceKeyword.mace_kernel_str - kernels_arg.ints.extend([out_channels, - in_channels, - filter_height, - filter_width]) - - if data_format is not None: - data_format_arg = shape_op.arg.add() - data_format_arg.name = MaceKeyword.mace_data_format_str - data_format_arg.i = data_format.value - - if ConverterUtil.get_arg(op, - MaceKeyword.mace_padding_str) \ - is not None: - padding_arg = shape_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_str - padding_arg.i = ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_str).i - elif ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str) \ - is not None: - padding_arg = shape_op.arg.add() - padding_arg.name = MaceKeyword.mace_padding_values_str - padding_arg.ints.extend(ConverterUtil.get_arg( - op, MaceKeyword.mace_padding_values_str).ints) - - # Inverse transform - iwt_op = net.op.add() - iwt_op.name = op.name + '_inverse_transform' - iwt_op.type = MaceOp.WinogradInverseTransform.name - iwt_op.input.extend([matmul_op.output[0]]) - iwt_op.input.extend([shape_op.output[0]]) - # biasadd - if len(op.input) >= 3: - iwt_op.input.extend([op.input[2]]) - iwt_op.output.extend(op.output) - iwt_output_shape = iwt_op.output_shape.add() - iwt_output_shape.dims.extend(op.output_shape[0].dims) - - blk_size_arg = iwt_op.arg.add() - blk_size_arg.name = MaceKeyword.mace_wino_block_size - blk_size_arg.i = block_size - ConverterUtil.add_data_format_arg(iwt_op, data_format) - - self.safe_remove_node(op, iwt_op) - - return False - def transform_add_to_biasadd(self): net = self._model for op in net.op: @@ -1105,37 +916,25 @@ class Transformer(base_converter.ConverterInterface): if arg.name == MaceKeyword.mace_paddings_str: mace_check(len(arg.ints) == 8, "pad dim rank should be 8.") - if ConverterUtil.data_format(op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose pad args: %s(%s)" - % (op.name, op.type)) - self.transpose_shape(arg.ints, - [0, 1, 6, 7, 2, 3, 4, 5]) - elif ConverterUtil.data_format(op) == DataFormat.NCHW \ + if ConverterUtil.data_format(op) == DataFormat.NCHW \ and self._target_data_format == DataFormat.NHWC: # noqa print("Transpose pad args: %s(%s)" % (op.name, op.type)) self.transpose_shape(arg.ints, [0, 1, 4, 5, 6, 7, 2, 3]) - elif op.type == MaceOp.Concat.name or op.type == MaceOp.Slice.name: + elif op.type == MaceOp.Concat.name or op.type == MaceOp.Split.name: for arg in op.arg: if arg.name == MaceKeyword.mace_axis_str: - if ConverterUtil.data_format(op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose slice args: %s(%s)" - % (op.name, op.type)) - mace_check(arg.i == 3, - 'only support concat at ' - 'channel dimension') - arg.i = 1 - elif ConverterUtil.data_format(op) == DataFormat.NCHW \ + if ConverterUtil.data_format(op) == DataFormat.NCHW \ and self._target_data_format == DataFormat.NHWC: # noqa - print("Transpose slice args: %s(%s)" + print("Transpose concat/split args: %s(%s)" % (op.name, op.type)) - mace_check(arg.i == 1, - "only support concat at " - "channel dimension") 
- arg.i = 3 + if arg.i == 1: + arg.i = 3 + elif arg.i == 2: + arg.i = 1 + elif arg.i == 3: + arg.i = 2 producer = self._producer[op.input[0]] input_shape = producer.output_shape[0].dims @@ -1150,17 +949,7 @@ class Transformer(base_converter.ConverterInterface): elif op.type == MaceOp.Squeeze.name: for arg in op.arg: if arg.name == MaceKeyword.mace_axis_str: - if ConverterUtil.data_format( - op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose squeeze args: %s(%s)" - % (op.name, op.type)) - mace_check(list(arg.ints) == [1, 2], - 'only support squeeze at at [1, 2]') - arg.ints[:] = [2, 3] - elif ConverterUtil.data_format( - op) == DataFormat.NCHW \ - and self._target_data_format == DataFormat.NHWC: # noqa + if ConverterUtil.data_format(op) == DataFormat.NCHW: print("Transpose squeeze args: %s(%s)" % (op.name, op.type)) mace_check(list(arg.ints) == [2, 3], @@ -1171,24 +960,6 @@ class Transformer(base_converter.ConverterInterface): for arg in op.arg: if arg.name == MaceKeyword.mace_axis_str: if ConverterUtil.data_format( - op) == DataFormat.NHWC \ - and self._target_data_format == DataFormat.NCHW: # noqa - print("Transpose reduce mean args: %s(%s)" - % (op.name, op.type)) - reduce_axises = list(arg.ints) - new_axises = [] - for i in range(len(reduce_axises)): - idx = reduce_axises[i] - if idx == 1 or idx == 2: - new_axises.append(idx + 1) - elif idx == 3: - new_axises.append(1) - else: - new_axises.append(idx) - new_axises.sort() - arg.ints[:] = [] - arg.ints.extend(new_axises) - elif ConverterUtil.data_format( op) == DataFormat.NCHW \ and self._target_data_format == DataFormat.NHWC: # noqa print("Transpose reduce mean args: %s(%s)" @@ -1212,69 +983,26 @@ class Transformer(base_converter.ConverterInterface): if data_format is not None \ and data_format != self._target_data_format: print("Transpose output shapes: %s(%s)" % (op.name, op.type)) - if self._target_data_format == DataFormat.NHWC: # NCHW -> NHWC - for output_shape in op.output_shape: - if len(output_shape.dims) == 4: - self.transpose_shape(output_shape.dims, - [0, 2, 3, 1]) - else: # NHWC -> NCHW - for output_shape in op.output_shape: - if len(output_shape.dims) == 4: - self.transpose_shape(output_shape.dims, - [0, 3, 1, 2]) + for output_shape in op.output_shape: + if len(output_shape.dims) == 4: + self.transpose_shape(output_shape.dims, + [0, 2, 3, 1]) ConverterUtil.get_arg(op, MaceKeyword.mace_data_format_str).i = \ self._target_data_format.value - # transpose input/output - if self._target_data_format == DataFormat.NCHW: - print("Transpose input/output to NCHW") - for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name - op = net.op.add() - op.name = self.normalize_op_name(input_node.name) - op.input.extend([new_input_name]) - op.output.extend([input_node.name]) - output_shape = op.output_shape.add() - output_shape.dims.extend(input_node.shape) - if len(output_shape.dims) == 4: - op.type = MaceOp.Transpose.name - self.transpose_shape(output_shape.dims, [0, 3, 1, 2]) - - dims_arg = op.arg.add() - dims_arg.name = MaceKeyword.mace_dims_str - dims_arg.ints.extend([0, 3, 1, 2]) - else: - op.type = MaceOp.Identity.name - - ConverterUtil.add_data_type_arg(op, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op, DataFormat.NCHW) + return False - for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name - op = self._model.op.add() - 
op.name = self.normalize_op_name(output_name) - op.input.extend([output_node.name]) - op.output.extend([output_name]) - output_shape = op.output_shape.add() - output_shape.dims.extend( - self._producer[output_node.name].output_shape[0].dims) - if len(output_shape.dims) == 4: - op.type = MaceOp.Transpose.name - self.transpose_shape(output_shape.dims, [0, 2, 3, 1]) - - dims_arg = op.arg.add() - dims_arg.name = MaceKeyword.mace_dims_str - dims_arg.ints.extend([0, 2, 3, 1]) - - ConverterUtil.add_data_format_arg(op, DataFormat.NHWC) - else: - op.type = MaceOp.Identity.name - ConverterUtil.add_data_type_arg(op, mace_pb2.DT_FLOAT) + def add_winograd_arg(self): + if self._wino_arg == 0: + return False + net = self._model - self._input_output_added = True + for op in net.op: + if op.type == MaceOp.Conv2D.name: + winograd_arg = op.arg.add() + winograd_arg.name = MaceKeyword.mace_wino_arg_str + winograd_arg.i = self._wino_arg return False @@ -1400,168 +1128,6 @@ class Transformer(base_converter.ConverterInterface): return False - def buffer_transform(self, op, input_idx, input_type): - net = self._model - input_name = op.input[input_idx] - op_def = net.op.add() - op_def.name = input_name.replace(':', '_') + "_b2i" - output_name = op_def.name - op_def.type = MaceKeyword.mace_buffer_transform - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_buffer_type - arg.i = input_type.value - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_mode - arg.i = 0 - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - - tensor_shape = list(self._consts[input_name].dims) - if input_type == OpenCLBufferType.WINOGRAD_FILTER: - blk_sqr = op.output_shape[0].dims[0] - wino_blk = int(np.sqrt(blk_sqr)) - 2 - wino_arg = op_def.arg.add() - wino_arg.name = MaceKeyword.mace_wino_block_size - wino_arg.i = wino_blk - img_shape = calculate_image_shape(input_type, tensor_shape, - wino_blk) - else: - img_shape = calculate_image_shape(input_type, tensor_shape) - - op.input[input_idx] = output_name - - # update OpenCL max image size - self._opencl_max_image_size[0] = max(self._opencl_max_image_size[0], - img_shape[0]) - self._opencl_max_image_size[1] = max(self._opencl_max_image_size[1], - img_shape[1]) - - def add_buffer_transform(self): - if self._option.device != DeviceType.GPU.value: - return False - - print("Add buffer transform op") - - net = self._model - for op in net.op: - if op.type == MaceOp.Conv2D.name: - self.buffer_transform(op, 1, OpenCLBufferType.CONV2D_FILTER) - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.Deconv2D.name\ - or op.type == MaceOp.DepthwiseDeconv2d.name: - if op.type == MaceOp.Deconv2D.name: - self.buffer_transform(op, 1, - OpenCLBufferType.CONV2D_FILTER) - elif op.type == MaceOp.DepthwiseDeconv2d.name: - self.buffer_transform(op, 1, - OpenCLBufferType.DW_CONV2D_FILTER) - if ConverterUtil.get_arg( - op, - MaceKeyword.mace_framework_type_str).i == \ - FrameworkType.CAFFE.value: - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif len(op.input) >= 4: - self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.DepthwiseConv2d.name: - self.buffer_transform(op, 1, OpenCLBufferType.DW_CONV2D_FILTER) - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.BiasAdd.name: - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == 
MaceOp.Eltwise.name and len(op.input) == 2: - if op.input[0] in self._consts \ - and len(self._consts[op.input[0]].dims) == 1: - self.buffer_transform(op, 0, OpenCLBufferType.ARGUMENT) - if op.input[1] in self._consts \ - and len(self._consts[op.input[1]].dims) == 1: - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.BatchNorm.name: - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - if len(op.input) >= 4: - self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.MatMul.name and \ - ConverterUtil.get_arg( - op, - MaceKeyword.mace_winograd_filter_transformed - ) is not None: # noqa - self.buffer_transform(op, 0, OpenCLBufferType.WINOGRAD_FILTER) - elif op.type == MaceOp.WinogradInverseTransform.name \ - and len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.FullyConnected.name: - self.buffer_transform(op, 1, OpenCLBufferType.WEIGHT_WIDTH) - if len(op.input) >= 3: - self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.Activation.name: - if ConverterUtil.get_arg( - op, - MaceKeyword.mace_activation_type_str - ).s == ActivationType.PRELU.name: # noqa - self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.LSTMCell.name: - if op.input[1] in self._consts: - self.buffer_transform(op, 1, - OpenCLBufferType.IN_OUT_CHANNEL) - self.buffer_transform(op, 2, OpenCLBufferType.IN_OUT_CHANNEL) - self.buffer_transform(op, 3, OpenCLBufferType.ARGUMENT) - if op.input[4] in self._consts: - self.buffer_transform(op, 4, - OpenCLBufferType.IN_OUT_CHANNEL) - - # Add OpenCL max image size - if self._option.cl_mem_type == "image": - arg = net.arg.add() - arg.name = MaceKeyword.mace_opencl_max_image_size - arg.ints.extend(self._opencl_max_image_size) - - for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name - op_def = self._model.op.add() - - op_def.name = self.normalize_op_name(input_node.name) - op_def.type = MaceKeyword.mace_buffer_transform - op_def.input.extend([new_input_name]) - op_def.output.extend([input_node.name]) - output_shape = op_def.output_shape.add() - output_shape.dims.extend(input_node.shape) - - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_buffer_type - arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) - - for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name - op_def = self._model.op.add() - op_def.name = self.normalize_op_name(output_name) - op_def.type = MaceKeyword.mace_buffer_inverse_transform - op_def.input.extend([output_node.name]) - op_def.output.extend([output_name]) - if output_node.shape: - output_shape = op_def.output_shape.add() - output_shape.dims.extend(output_node.shape) - - arg = op_def.arg.add() - arg.name = MaceKeyword.mace_buffer_type - arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) - self._output_op_names.add(op_def.name) - - self._input_output_added = True - - return False - def fold_reshape(self): net = self._model for op in net.op: @@ -1654,37 +1220,33 @@ class Transformer(base_converter.ConverterInterface): return False - def add_device(self): 
- # TODO(liuqi) add device definition in OperatorDef - net = self._model - for op in net.op: - arg = op.arg.add() - arg.name = MaceKeyword.mace_device - arg.i = self._option.device - - return False - def update_float_op_data_type(self): if self._option.quantize: return print("update op with float data type") net = self._model + # TODO(liuqi): unify the data_type when CPU support half storage + data_type = self._option.data_type + if self._option.device == DeviceType.CPU.value: + data_type = mace_pb2.DT_HALF for op in net.op: data_type_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_op_data_type_str) if not data_type_arg: data_type_arg = op.arg.add() data_type_arg.name = MaceKeyword.mace_op_data_type_str - data_type_arg.i = self._option.data_type - elif data_type_arg.i != self._option.data_type \ + data_type_arg.i = data_type + elif data_type_arg.i != data_type \ and data_type_arg.i == mace_pb2.DT_FLOAT \ and op.name not in self._output_op_names: - data_type_arg.i = self._option.data_type + data_type_arg.i = data_type return False def sort_dfs(self, op, visited, sorted_nodes): + if op.name in visited: + return visited.update([op.name]) if len(op.input) > 0: for input_tensor in op.input: @@ -1695,40 +1257,6 @@ class Transformer(base_converter.ConverterInterface): self.sort_dfs(producer_op, visited, sorted_nodes) sorted_nodes.append(op) - def add_mace_input_and_output_nodes(self): - if self._input_output_added: - return - - print("add mace input and output nodes") - - for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name - op_def = self._model.op.add() - op_def.name = self.normalize_op_name(input_node.name) - op_def.type = MaceOp.Identity.name - op_def.input.extend([new_input_name]) - op_def.output.extend([input_node.name]) - output_shape = op_def.output_shape.add() - output_shape.dims.extend(input_node.shape) - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) - - for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name - op_def = self._model.op.add() - op_def.name = self.normalize_op_name(output_name) - op_def.type = MaceOp.Identity.name - op_def.input.extend([output_node.name]) - op_def.output.extend([output_name]) - output_shape = op_def.output_shape.add() - output_shape.dims.extend( - self._producer[output_node.name].output_shape[0].dims) - - ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) - def sort_by_execution(self): print("Sort by execution") net = self._model @@ -1736,11 +1264,9 @@ class Transformer(base_converter.ConverterInterface): sorted_nodes = [] for output_node in self._option.output_nodes: - output_tensor = MaceKeyword.mace_output_node_name \ - + '_' + output_node - mace_check(output_tensor in self._producer, - "output_tensor %s not existed in model" % output_tensor) - self.sort_dfs(self._producer[output_tensor], visited, sorted_nodes) + mace_check(output_node in self._producer, + "output_tensor %s not existed in model" % output_node) + self.sort_dfs(self._producer[output_node], visited, sorted_nodes) del net.op[:] net.op.extend(sorted_nodes) @@ -1756,28 +1282,50 @@ class Transformer(base_converter.ConverterInterface): return False print("Add mace quantize and dequantize nodes") + input_name_map = {} + output_name_map = {} + + for input_node in self._option.input_nodes.values(): + new_input_name = MaceKeyword.mace_input_node_name \ + + 
'_' + input_node.name + input_name_map[input_node.name] = new_input_name + + for output_node in self._option.output_nodes.values(): + new_output_name = MaceKeyword.mace_output_node_name \ + + '_' + output_node.name + output_name_map[output_node.name] = new_output_name for op in self._model.op: + for i in range(len(op.input)): + if op.input[i] in input_name_map: + op.input[i] = input_name_map[op.input[i]] + for i in range(len(op.output)): + if op.output[i] in output_name_map: + op.output[i] = output_name_map[op.output[i]] + data_type_arg = ConverterUtil.get_arg( op, MaceKeyword.mace_op_data_type_str) mace_check(data_type_arg, "Data type does not exist for %s(%s)" % (op.name, op.type)) if data_type_arg.i == mace_pb2.DT_FLOAT: data_type_arg.i = mace_pb2.DT_UINT8 + elif data_type_arg.i == mace_pb2.DT_UINT8: + mace_check(op.type == MaceOp.Quantize.name + or op.type == MaceOp.Dequantize.name, + "Only Quantization ops support uint8, " + "but got %s(%s)" % (op.name, op.type)) else: - mace_check(False, + mace_check(op.type == MaceOp.Quantize.name, "Quantization only support float ops, " "but get %s(%s)" % (op.name, op.type)) for input_node in self._option.input_nodes.values(): - new_input_name = MaceKeyword.mace_input_node_name \ - + '_' + input_node.name op_def = self._model.op.add() op_def.name = self.normalize_op_name(input_node.name) op_def.type = MaceOp.Quantize.name - op_def.input.extend([new_input_name]) - op_def.output.extend([input_node.name]) + op_def.input.extend([input_node.name]) + op_def.output.extend([input_name_map[input_node.name]]) output_shape = op_def.output_shape.add() output_shape.dims.extend(input_node.shape) @@ -1785,13 +1333,12 @@ class Transformer(base_converter.ConverterInterface): ConverterUtil.add_data_format_arg(op_def, DataFormat.NHWC) for output_node in self._option.output_nodes.values(): - output_name = MaceKeyword.mace_output_node_name \ - + '_' + output_node.name op_def = self._model.op.add() - op_def.name = self.normalize_op_name(output_name) + op_def.name = self.normalize_op_name( + output_name_map[output_node.name]) op_def.type = MaceOp.Dequantize.name - op_def.input.extend([output_node.name]) - op_def.output.extend([output_name]) + op_def.input.extend([output_name_map[output_node.name]]) + op_def.output.extend([output_node.name]) output_shape = op_def.output_shape.add() output_shape.dims.extend( self._producer[output_node.name].output_shape[0].dims) @@ -1799,7 +1346,9 @@ class Transformer(base_converter.ConverterInterface): ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_UINT8) - self._input_output_added = True + quantize_flag_arg = self._model.arg.add() + quantize_flag_arg.name = MaceKeyword.mace_quantize_flag_arg_str + quantize_flag_arg.i = 1 return False @@ -2057,6 +1606,8 @@ class Transformer(base_converter.ConverterInterface): if input_node.name not in self._quantize_activation_info: print("Input range %s: %s" % (input_node.name, str(input_node.range))) + new_input_name = MaceKeyword.mace_input_node_name \ + + '_' + input_node.name scale, zero, minval, maxval = \ quantize_util.adjust_range(input_node.range[0], input_node.range[1], @@ -2066,7 +1617,7 @@ class Transformer(base_converter.ConverterInterface): quantize_info.maxval = maxval quantize_info.scale = scale quantize_info.zero_point = zero - self._quantize_activation_info[input_node.name] = quantize_info + self._quantize_activation_info[new_input_name] = quantize_info return False @@ -2084,9 +1635,6 @@ class Transformer(base_converter.ConverterInterface): "missing quantize info: %s" % op) def 
add_opencl_informations(self): - if self._option.device != DeviceType.GPU.value: - return False - print("Add OpenCL informations") net = self._model diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py deleted file mode 100644 index 1de554d4cd5df0dadf83e60b2231750f691ded62..0000000000000000000000000000000000000000 --- a/mace/python/tools/memory_optimizer.py +++ /dev/null @@ -1,349 +0,0 @@ -# Copyright 2018 Xiaomi, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import operator - -import six - -from mace.proto import mace_pb2 - -from mace.python.tools.converter_tool import base_converter as cvt -from mace.python.tools.converter_tool.base_converter import DeviceType -from mace.python.tools.converter_tool.base_converter import ConverterUtil -from mace.python.tools.converter_tool.base_converter import MaceKeyword -from mace.python.tools.convert_util import calculate_image_shape -from mace.python.tools.convert_util import OpenCLBufferType - - -def MemoryTypeToStr(mem_type): - if mem_type == mace_pb2.CPU_BUFFER: - return 'CPU_BUFFER' - elif mem_type == mace_pb2.GPU_BUFFER: - return 'GPU_BUFFER' - elif mem_type == mace_pb2.GPU_IMAGE: - return 'GPU_IMAGE' - else: - return 'UNKNOWN' - - -class MemoryBlock(object): - def __init__(self, mem_type, block): - self._mem_type = mem_type - self._block = block - - @property - def mem_type(self): - return self._mem_type - - @property - def block(self): - return self._block - - -class MemoryOptimizer(object): - def __init__(self, net_def): - self.net_def = net_def - self.idle_mem = set() - self.op_mem = {} # op_name->mem_id - self.mem_block = {} # mem_id->[size] or mem_id->[x, y] - self.total_mem_count = 0 - self.input_ref_counter = {} - self.mem_ref_counter = {} - ocl_mem_type_arg = ConverterUtil.get_arg( - net_def, MaceKeyword.mace_opencl_mem_type) - self.cl_mem_type = ocl_mem_type_arg.i if ocl_mem_type_arg is not None \ - else None - - consumers = {} - for op in net_def.op: - if not self.op_need_optimize_memory(op): - continue - for ipt in op.input: - if ipt not in consumers: - consumers[ipt] = [] - consumers[ipt].append(op) - # only ref op's output tensor - for op in net_def.op: - if not self.op_need_optimize_memory(op): - continue - for output in op.output: - tensor_name = output - if tensor_name in consumers: - self.input_ref_counter[tensor_name] = \ - len(consumers[tensor_name]) - else: - self.input_ref_counter[tensor_name] = 0 - - def op_need_optimize_memory(self, op): - return True - - def get_op_mem_block(self, op_type, output_shape, output_type): - data_type_size = 4 - if output_type == mace_pb2.DT_UINT8: - data_type_size = 1 - return MemoryBlock(mace_pb2.CPU_BUFFER, - [six.moves.reduce(operator.mul, output_shape, 1) * - data_type_size]) - - def mem_size(self, memory_block): - return memory_block.block[0] - - def sub_mem_block(self, mem_block1, mem_block2): - return self.mem_size(mem_block1) - self.mem_size(mem_block2) - - def resize_mem_block(self, old_mem_block, 
op_mem_block): - return MemoryBlock( - old_mem_block.mem_type, - [max(old_mem_block.block[0], op_mem_block.block[0])]) - - def add_net_mem_blocks(self): - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.device_type = DeviceType.CPU.value - block.mem_type = self.mem_block[mem].mem_type - block.x = self.mem_block[mem].block[0] - block.y = 1 - - def get_total_origin_mem_size(self): - origin_mem_size = 0 - for op in self.net_def.op: - if not self.op_need_optimize_memory(op): - continue - origin_mem_size += six.moves.reduce(operator.mul, - op.output_shape[0].dims, - 1) - return origin_mem_size - - def get_total_optimized_mem_size(self): - optimized_mem_size = 0 - for mem in self.mem_block: - print(mem, MemoryTypeToStr(self.mem_block[mem].mem_type), - self.mem_block[mem].block) - optimized_mem_size += self.mem_size(self.mem_block[mem]) - return optimized_mem_size - - @staticmethod - def is_memory_reuse_op(op): - return op.type == 'Reshape' or op.type == 'Identity' \ - or op.type == 'Squeeze' or op.type == 'ExpandDims' - - def optimize(self): - for op in self.net_def.op: - if not self.op_need_optimize_memory(op): - continue - if not op.output_shape: - six.print_("WARNING: There is no output shape information to " - "do memory optimization. %s (%s)" % - (op.name, op.type), file=sys.stderr) - return - if len(op.output_shape) != len(op.output): - six.print_('WARNING: the number of output shape is ' - 'not equal to the number of output.', - file=sys.stderr) - return - for i in range(len(op.output)): - if self.is_memory_reuse_op(op): - # make these ops reuse memory of input tensor - mem_id = self.op_mem.get(op.input[0], -1) - else: - output_type = mace_pb2.DT_FLOAT - for arg in op.arg: - if arg.name == 'T': - output_type = arg.i - if len(op.output_type) > i: - output_type = op.output_type[i] - op_mem_block = self.get_op_mem_block( - op.type, - op.output_shape[i].dims, - output_type) - mem_id = -1 - if len(self.idle_mem) > 0: - best_mem_add_size = six.MAXSIZE - best_mem_waste_size = six.MAXSIZE - for mid in self.idle_mem: - old_mem_block = self.mem_block[mid] - if old_mem_block.mem_type != op_mem_block.mem_type: - continue - new_mem_block = self.resize_mem_block( - old_mem_block, op_mem_block) - add_mem_size = self.sub_mem_block(new_mem_block, - old_mem_block) - waste_mem_size = self.sub_mem_block(new_mem_block, - op_mem_block) - - # minimize add_mem_size; if best_mem_add_size is 0, - # then minimize waste_mem_size - if (best_mem_add_size > 0 and - add_mem_size < best_mem_add_size) \ - or (best_mem_add_size == 0 and - waste_mem_size < best_mem_waste_size): - best_mem_id = mid - best_mem_add_size = add_mem_size - best_mem_waste_size = waste_mem_size - best_mem_block = new_mem_block - - # if add mem size < op mem size, then reuse it - if best_mem_add_size <= self.mem_size(op_mem_block): - self.mem_block[best_mem_id] = best_mem_block - mem_id = best_mem_id - self.idle_mem.remove(mem_id) - - if mem_id == -1: - mem_id = self.total_mem_count - self.total_mem_count += 1 - self.mem_block[mem_id] = op_mem_block - - if mem_id != -1: - op.mem_id.extend([mem_id]) - self.op_mem[op.output[i]] = mem_id - if mem_id not in self.mem_ref_counter: - self.mem_ref_counter[mem_id] = 1 - else: - self.mem_ref_counter[mem_id] += 1 - - # de-ref input tensor mem - for idx in six.moves.range(len(op.input)): - ipt = op.input[idx] - if ipt in self.input_ref_counter: - self.input_ref_counter[ipt] -= 1 - if self.input_ref_counter[ipt] == 0 \ - and ipt in 
self.op_mem: - mem_id = self.op_mem[ipt] - self.mem_ref_counter[mem_id] -= 1 - if self.mem_ref_counter[mem_id] == 0: - self.idle_mem.add(self.op_mem[ipt]) - elif self.input_ref_counter[ipt] < 0: - raise Exception('ref count is less than 0') - - self.add_net_mem_blocks() - - print("total op: %d" % len(self.net_def.op)) - print("origin mem: %d, optimized mem: %d" % ( - self.get_total_origin_mem_size(), - self.get_total_optimized_mem_size())) - - -class GPUMemoryOptimizer(MemoryOptimizer): - def op_need_optimize_memory(self, op): - if op.type == MaceKeyword.mace_buffer_transform: - for arg in op.arg: - if arg.name == 'mode' and arg.i == 0: - return False - return op.type != MaceKeyword.mace_buffer_inverse_transform - - def get_op_image_mem_block(self, op_type, output_shape): - if op_type == 'WinogradTransform' or op_type == 'MatMul': - buffer_shape = list(output_shape) + [1] - mem_block = MemoryBlock( - mace_pb2.GPU_IMAGE, - calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT, - buffer_shape)) - elif op_type in ['Shape', - 'InferConv2dShape', - 'StridedSlice', - 'Stack', - 'ScalarMath']: - if len(output_shape) == 1: - mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, - [output_shape[0], 1]) - elif len(output_shape) == 0: - mem_block = MemoryBlock(mace_pb2.CPU_BUFFER, - [1, 1]) - else: - raise Exception('%s output shape dim size is not 0 or 1.' % - op_type) - else: - if len(output_shape) == 2: # only support fc/softmax - buffer_shape = [output_shape[0], output_shape[1]] - elif len(output_shape) == 4: - buffer_shape = output_shape - else: - raise Exception('%s output shape dim size is not 2 or 4.' % - op_type) - mem_block = MemoryBlock( - mace_pb2.GPU_IMAGE, - calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL, - buffer_shape)) - return mem_block - - def get_op_buffer_mem_block(self, output_shape): - return MemoryBlock(mace_pb2.GPU_BUFFER, - [reduce(operator.mul, output_shape, 1), 1]) - - def get_op_mem_block(self, op_type, output_shape, output_type): - if self.cl_mem_type == mace_pb2.GPU_IMAGE: - return self.get_op_image_mem_block(op_type, output_shape) - else: - return self.get_op_buffer_mem_block(output_shape) - - def mem_size(self, memory_block): - if memory_block.mem_type == mace_pb2.GPU_IMAGE: - return memory_block.block[0] * memory_block.block[1] * 4 - else: - return memory_block.block[0] - - def resize_mem_block(self, old_mem_block, op_mem_block): - resize_mem_block = MemoryBlock( - old_mem_block.mem_type, - [ - max(old_mem_block.block[0], op_mem_block.block[0]), - max(old_mem_block.block[1], op_mem_block.block[1]) - ]) - - return resize_mem_block - - def add_net_mem_blocks(self): - max_image_size_x = 0 - max_image_size_y = 0 - for mem in self.mem_block: - arena = self.net_def.mem_arena - block = arena.mem_block.add() - block.mem_id = mem - block.device_type = DeviceType.GPU.value - block.mem_type = self.mem_block[mem].mem_type - block.x = self.mem_block[mem].block[0] - block.y = self.mem_block[mem].block[1] - if self.mem_block[mem].mem_type == mace_pb2.GPU_IMAGE: - max_image_size_x = max(max_image_size_x, block.x) - max_image_size_y = max(max_image_size_y, block.y) - - if self.cl_mem_type == mace_pb2.GPU_IMAGE: - # Update OpenCL max image size - net_ocl_max_img_size_arg = None - for arg in self.net_def.arg: - if arg.name == cvt.MaceKeyword.mace_opencl_max_image_size: - net_ocl_max_img_size_arg = arg - max_image_size_x = max(arg.ints[0], max_image_size_x) - max_image_size_y = max(arg.ints[1], max_image_size_y) - break - if net_ocl_max_img_size_arg is None: - net_ocl_max_img_size_arg = 
self.net_def.arg.add() - net_ocl_max_img_size_arg.name = \ - cvt.MaceKeyword.mace_opencl_max_image_size - - net_ocl_max_img_size_arg.ints[:] = [max_image_size_x, - max_image_size_y] - - -def optimize_gpu_memory(net_def): - mem_optimizer = GPUMemoryOptimizer(net_def) - mem_optimizer.optimize() - - -def optimize_cpu_memory(net_def): - mem_optimizer = MemoryOptimizer(net_def) - mem_optimizer.optimize() diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 3f4ba1c4f5d907352a0cee9bca719fa29be08768..f985a75abc718c32adf1215777a01aaaa8fe5df9 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -80,6 +80,7 @@ void CreateInputInfo(NetDef *net_def) { input_info = net_def->add_input_info(); input_info->set_name({{ net.input_info[idx].name|tojson }}); input_info->set_data_type(static_cast({{ net.input_info[idx].data_type }})); + input_info->set_data_format(static_cast({{ net.input_info[idx].data_format }})); input_info->mutable_dims()->Reserve({{ net.input_info[idx].dims|length }}); {% for dim in net.input_info[idx].dims %} input_info->add_dims({{ dim }}); @@ -96,6 +97,7 @@ void CreateOutputInfo(NetDef *net_def) { output_info = net_def->add_output_info(); output_info->set_name({{ net.output_info[idx].name|tojson }}); output_info->set_data_type(static_cast({{ net.output_info[idx].data_type }})); + output_info->set_data_format(static_cast({{ net.output_info[idx].data_format }})); output_info->mutable_dims()->Reserve({{ net.output_info[idx].dims|length }}); {% for dim in net.output_info[idx].dims %} output_info->add_dims({{dim}}); @@ -121,23 +123,6 @@ void CreateTensors(NetDef *net_def) { mace::{{tag}}::CreateTensor{{ i }}(net_def->add_tensors()); {% endfor %} } - -{% if net.mem_arena.mem_block|length != 0 %} -void CreateMemoryArena(mace::MemoryArena *mem_arena) { - mem_arena->mutable_mem_block()->Reserve({{ net.mem_arena.mem_block|length }}); - {% for i in range(net.mem_arena.mem_block|length) %} - - mace::MemoryBlock* mem_block{{i}} = mem_arena->add_mem_block(); - mem_block{{i}}->set_mem_id({{net.mem_arena.mem_block[i].mem_id}}); - mem_block{{i}}->set_device_type(static_cast({{net.mem_arena.mem_block[i].device_type}})); - mem_block{{i}}->set_mem_type(static_cast({{net.mem_arena.mem_block[i].mem_type}})); - mem_block{{i}}->set_x({{net.mem_arena.mem_block[i].x}}); - mem_block{{i}}->set_y({{net.mem_arena.mem_block[i].y}}); - - {% endfor %} -} -{% endif %} - } // namespace namespace {{tag}} { @@ -153,9 +138,6 @@ const std::shared_ptr CreateNet() { {% if net.arg|length != 0 %} CreateNetArg(net_def.get()); {% endif %} - {% if net.mem_arena.mem_block|length != 0 %} - CreateMemoryArena(net_def->mutable_mem_arena()); - {% endif %} {% if net.input_info | length > 0 %} CreateInputInfo(net_def.get()); {% endif %} @@ -179,8 +161,8 @@ const std::string ModelBuildTime() { } const std::string ModelBuildOptions() { - return {{ "runtime: {}, obfuscate: {}, embed_model_data: {}, winograd: {}" - .format(runtime, obfuscate, embed_model_data, winograd_conv)|tojson }}; + return {{ "obfuscate: {}, embed_model_data: {}, winograd: {}" + .format(obfuscate, embed_model_data, winograd_conv)|tojson }}; } } // namespace {{tag}} diff --git a/mace/python/tools/model_saver.py b/mace/python/tools/model_saver.py index ea90a6264cc6a8697fc1857d0ff58833abd29389..c2221426f03ead1e6709fb756521d4ca75bc731a 100644 --- a/mace/python/tools/model_saver.py +++ b/mace/python/tools/model_saver.py @@ -20,6 +20,7 @@ import hashlib from enum import Enum from mace.proto import mace_pb2 +from 
mace.python.tools.converter_tool import base_converter as cvt from mace.python.tools.convert_util import mace_check from jinja2 import Environment, FileSystemLoader @@ -82,20 +83,24 @@ def generate_in_out_map(ops, tensor_map): return in_out_map -def obfuscate_name(net_def): - input_node = "mace_input_node" - output_node = "mace_output_node" +def obfuscate_name(option, net_def): + input_nodes = set() + for name in option.input_nodes: + input_nodes.add(name) + output_nodes = set() + for name in option.output_nodes: + output_nodes.add(name) tensor_map = generate_tensor_map(net_def.tensors) in_out_map = generate_in_out_map(net_def.op, tensor_map) for t in net_def.tensors: - if input_node not in t.name and output_node not in t.name: + if t.name not in input_nodes and t.name not in output_nodes: t.name = tensor_map[t.name] for op in net_def.op: for i in range(len(op.input)): - if input_node not in op.input[i]: + if op.input[i] not in input_nodes: op.input[i] = in_out_map[op.input[i]] for i in range(len(op.output)): - if output_node not in op.output[i]: + if op.output[i] not in output_nodes: op.output[i] = in_out_map[op.output[i]] @@ -124,15 +129,14 @@ class TensorInfo: tensor.data_type) -def update_tensor_infos(net_def, runtime, data_type): +def update_tensor_infos(net_def, data_type, device): offset = 0 counter = 0 tensor_infos = [] for tensor in net_def.tensors: - # update data_type - if tensor.data_type == mace_pb2.DT_FLOAT and runtime == 'gpu' \ - and data_type == GPUDataType.fp16_fp32: - tensor.data_type = mace_pb2.DT_HALF + if device == cvt.DeviceType.GPU.value and\ + tensor.data_type == mace_pb2.DT_FLOAT: + tensor.data_type = data_type # Add offset and data_size tensor_info = TensorInfo(counter, tensor) @@ -195,7 +199,7 @@ def save_model_to_proto(net_def, model_tag, output_dir): f.write(str(net_def)) -def save_model_to_code(net_def, model_tag, runtime, +def save_model_to_code(net_def, model_tag, device, template_dir, output_dir, embed_model_data, model_checksum, weight_checksum, obfuscate, winograd_conv): @@ -241,7 +245,7 @@ def save_model_to_code(net_def, model_tag, runtime, end=min(start + 10, op_size), net=net_def, tag=model_tag, - runtime=runtime, + device=device, ) with open(output_dir + 'op' + str(counter) + '.cc', "w") as f: f.write(source) @@ -256,7 +260,6 @@ def save_model_to_code(net_def, model_tag, runtime, source = j2_env.get_template(template_name).render( net=net_def, tag=model_tag, - runtime=runtime, obfuscate=obfuscate, embed_model_data=embed_model_data, winograd_conv=winograd_conv, @@ -272,15 +275,15 @@ def save_model_to_code(net_def, model_tag, runtime, f.write(source) -def save_model(net_def, model_checksum, weight_checksum, template_dir, - obfuscate, model_tag, output_dir, runtime, embed_model_data, - winograd_conv, data_type, model_graph_format): +def save_model(option, net_def, model_checksum, weight_checksum, template_dir, + obfuscate, model_tag, output_dir, embed_model_data, + winograd_conv, model_graph_format): if obfuscate: - obfuscate_name(net_def) + obfuscate_name(option, net_def) output_dir = output_dir + '/' # update tensor type - update_tensor_infos(net_def, runtime, data_type) + update_tensor_infos(net_def, option.data_type, option.device) if model_graph_format == ModelFormat.file or not embed_model_data: save_model_data(net_def, model_tag, output_dir) @@ -288,7 +291,7 @@ def save_model(net_def, model_checksum, weight_checksum, template_dir, if model_graph_format == ModelFormat.file: save_model_to_proto(net_def, model_tag, output_dir) else: - 
save_model_to_code(net_def, model_tag, runtime, + save_model_to_code(net_def, model_tag, option.device, template_dir, output_dir, embed_model_data, model_checksum, weight_checksum, obfuscate, winograd_conv) diff --git a/mace/python/tools/operator.jinja2 b/mace/python/tools/operator.jinja2 index e3492ddf7f112fef067c1b51ef7ba83b3065711d..7b4c95029d4d087d438c3f019cac51275880ce1e 100644 --- a/mace/python/tools/operator.jinja2 +++ b/mace/python/tools/operator.jinja2 @@ -132,7 +132,7 @@ void CreateOperator{{i}}(mace::OperatorDef *op) { quantize_info{{j}}->set_maxval({{ net.op[i].quantize_info[j].maxval }}); {% endfor %} - {% if runtime == 'dsp' %} + {% if device == 3 %} op->set_padding({{ net.op[i].padding }}); {% if net.op[i].node_input | length > 0 %} std::vector input_node_ids({ {{ net.op[i].node_input | map(attribute='node_id') | join(', ') }} }); diff --git a/mace/test/BUILD b/mace/test/BUILD index 04253cda9a117cd6b7905837e8e4a09ffdd1ca21..593076a385725ae6058d7aa0070d1f03d1b9caba 100644 --- a/mace/test/BUILD +++ b/mace/test/BUILD @@ -6,6 +6,14 @@ licenses(["notice"]) # Apache 2.0 load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled", "if_android_armv7", "if_hexagon_enabled") +cc_library( + name = "mace_api_test_header", + hdrs = [ + "mace_api_test.h", + ], + copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], +) + cc_test( name = "mace_api_test", testonly = 1, @@ -20,6 +28,7 @@ cc_test( linkopts = ["-fopenmp"], linkstatic = 1, deps = [ + ":mace_api_test_header", "//mace/ops:test", "//mace/libmace:libmace", "@gtest//:gtest_main", @@ -40,6 +49,7 @@ cc_test( linkopts = ["-fopenmp"], linkstatic = 1, deps = [ + ":mace_api_test_header", "//mace/ops:test", "//mace/libmace:libmace", "@gtest//:gtest_main", diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 0bb8342dcffe376598fd061795b9eb26e971ce65..cce492736b6344ae2e064cab87e8ca687c37ed36 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
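The module removed above, mace/python/tools/memory_optimizer.py, planned buffer reuse at model-conversion time with a reference-counted, best-fit strategy, and model.jinja2 accordingly no longer emits CreateMemoryArena. To keep that strategy documented, here is a condensed, standalone sketch of it in plain Python; the names are illustrative, MACE's protobuf types are not used, and Reshape-style in-place reuse and GPU image sizing are omitted.

import operator
from functools import reduce


def plan_memory(ops):
    """Greedy buffer reuse. `ops` is a topologically ordered list of
    (op_name, input_tensor_names, output_tensor_names, output_shapes)."""
    # Count consumers of every tensor so its buffer can be recycled
    # right after its last reader has executed.
    ref_count = {}
    for _, inputs, outputs, _ in ops:
        for t in outputs:
            ref_count.setdefault(t, 0)
        for t in inputs:
            ref_count[t] = ref_count.get(t, 0) + 1

    blocks = {}      # mem_id -> size in elements
    assigned = {}    # tensor name -> mem_id
    idle = set()     # mem_ids currently free for reuse
    next_id = 0

    for _, inputs, outputs, shapes in ops:
        for tensor, shape in zip(outputs, shapes):
            need = reduce(operator.mul, shape, 1)
            # Best fit: reuse the idle block that has to grow the least.
            best, best_grow = None, None
            for candidate in idle:
                grow = max(blocks[candidate], need) - blocks[candidate]
                if best is None or grow < best_grow:
                    best, best_grow = candidate, grow
            if best is not None and best_grow <= need:
                idle.remove(best)
                blocks[best] = max(blocks[best], need)
                mem_id = best
            else:
                mem_id, next_id = next_id, next_id + 1
                blocks[mem_id] = need
            assigned[tensor] = mem_id
        # Release input buffers whose last consumer has just run.
        for tensor in inputs:
            if tensor in ref_count:
                ref_count[tensor] -= 1
                if ref_count[tensor] == 0 and tensor in assigned:
                    idle.add(assigned[tensor])

    return assigned, blocks


plan, sizes = plan_memory([
    ("conv1", ["input"], ["conv1_out"], [[1, 32, 32, 16]]),
    ("relu1", ["conv1_out"], ["relu1_out"], [[1, 32, 32, 16]]),
    ("conv2", ["relu1_out"], ["conv2_out"], [[1, 32, 32, 16]]),
])
# conv1_out's block is idle once relu1 has run, so conv2_out reuses it:
# plan == {'conv1_out': 0, 'relu1_out': 1, 'conv2_out': 0}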
-#include #include // NOLINT(build/c++11) -#include "mace/core/operator.h" -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" +#include "mace/test/mace_api_test.h" namespace mace { namespace test { @@ -26,253 +23,6 @@ class MaceMTAPITest : public ::testing::Test {}; namespace { -void GenerateInputs(const std::vector &input_names, - const std::vector &input_shape, - std::map *inputs) { - size_t input_size = input_names.size(); - for (size_t i = 0; i < input_size; ++i) { - // Allocate input and output - int64_t input_size = - std::accumulate(input_shape.begin(), input_shape.end(), 1, - std::multiplies()); - auto buffer_in = std::shared_ptr(new float[input_size], - std::default_delete()); - // load input - std::vector input_data; - ops::test::GenerateRandomRealTypeData(input_shape, &input_data); - memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float)); - (*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in); - } -} - -void GenerateOutputs(const std::vector &output_names, - const std::vector &output_shape, - std::map *outputs) { - size_t output_size = output_names.size(); - for (size_t i = 0; i < output_size; ++i) { - int64_t output_size = - std::accumulate(output_shape.begin(), output_shape.end(), 1, - std::multiplies()); - auto buffer_out = std::shared_ptr(new float[output_size], - std::default_delete()); - (*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out); - } -} - -template -void BufferToImage(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def, - const int mode = NetMode::NORMAL) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .AddIntArg("mode", mode) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - net_def->add_op()->CopyFrom(operator_def); -} - -template -void ImageToBuffer(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void Conv3x3(const std::string &input_name, - const std::string &filter_name, - const std::string &output_name, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Conv2D", "Conv2dOp") - .Input(input_name) - .Input(filter_name) - .Output(output_name) - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - net_def->add_op()->CopyFrom(operator_def); -} - -template 
-void Relu(const std::string &input_name, - const std::string &output_name, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Activation", "ReluTest") - .Input(input_name) - .Output(output_name) - .AddStringArg("activation", "RELU") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void AddTensor(const std::string &name, - const std::vector &shape, - const int offset, - const int data_size, - NetDef *net_def) { - ConstTensor *tensor_ptr = net_def->add_tensors(); - tensor_ptr->set_name(name); - tensor_ptr->mutable_dims()->Reserve(shape.size()); - for (auto dim : shape) { - tensor_ptr->add_dims(dim); - } - tensor_ptr->set_offset(offset); - tensor_ptr->set_data_size(data_size); - tensor_ptr->set_data_type(DataTypeToEnum::value); -} - -template -void CheckOutputs(const NetDef &net_def, - const std::map &inputs, - const std::map &outputs, - const std::vector &tensor_data) { - ops::test::OpsTestNet net; - for (auto input : inputs) { - auto input_shape = input.second.shape(); - const int64_t data_size = std::accumulate(input_shape.begin(), - input_shape.end(), 1, - std::multiplies()); - std::vector input_data(data_size); - memcpy(input_data.data(), input.second.data().get(), - data_size * sizeof(float)); - std::string input_name = MakeString("mace_input_node_", - input.first); - net.AddInputFromArray(input_name, input.second.shape(), - input_data); - } - auto tensors = net_def.tensors(); - for (auto tensor : tensors) { - std::vector shape = {tensor.dims().begin(), tensor.dims().end()}; - const int64_t data_size = std::accumulate(shape.begin(), - shape.end(), 1, - std::multiplies()); - std::vector data(data_size); - memcpy(data.data(), - reinterpret_cast(tensor_data.data()) + tensor.offset(), - tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); - } - net.RunNet(net_def, D); - - for (auto output : outputs) { - std::unique_ptr tmp_tensor( - new Tensor(GetCPUAllocator(), - DataTypeToEnum::v())); - auto output_shape = output.second.shape(); - const int64_t data_size = std::accumulate(output_shape.begin(), - output_shape.end(), 1, - std::multiplies()); - tmp_tensor->Resize(output.second.shape()); - float *data = tmp_tensor->mutable_data(); - memcpy(data, output.second.data().get(), data_size * sizeof(float)); - std::string output_name = MakeString("mace_output_node_", - output.first); - ops::test::ExpectTensorNear(*tmp_tensor, - *net.GetOutput(output_name.data()), - 1e-5); - } -} - -std::map AddMemoryOptimization( - const std::vector &input_names, - const std::vector &output_names, - const std::vector> &input_shapes, - const std::vector> &output_shapes, - NetDef *net_def) { - std::map res; - int mem_id = 0; - size_t input_shape_size = input_shapes.size(); - uint32_t in_mem_block_x = 0; - uint32_t in_mem_block_y = 0; - for (size_t i = 0; i < input_shape_size; ++i) { - in_mem_block_x = std::max(in_mem_block_x, - input_shapes[i][2] * - RoundUpDiv4(input_shapes[i][3])); - in_mem_block_y = std::max(in_mem_block_y, - input_shapes[i][0] * - input_shapes[i][1]); - } - size_t input_size = input_names.size(); - size_t output_size = output_names.size(); - MemoryArena *mem_arena_ptr = net_def->mutable_mem_arena(); - mem_arena_ptr->mutable_mem_block()->Reserve(input_size + output_size); - for (size_t i = 0; i < input_size; ++i) { - MemoryBlock *mem_blk_ptr = 
mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(in_mem_block_x); - mem_blk_ptr->set_y(in_mem_block_y); - res[input_names[i]] = mem_id; - mem_id++; - } - size_t output_shape_size = output_shapes.size(); - uint32_t out_mem_block_x = 0; - uint32_t out_mem_block_y = 0; - for (size_t i = 0; i < output_shape_size; ++i) { - out_mem_block_x = std::max(out_mem_block_x, - output_shapes[i][2] * - RoundUpDiv4(output_shapes[i][3])); - out_mem_block_y = std::max(out_mem_block_y, - output_shapes[i][0] * - output_shapes[i][1]); - } - for (size_t i = 0; i < output_size; ++i) { - MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(out_mem_block_x); - mem_blk_ptr->set_y(out_mem_block_y); - res[output_names[i]] = mem_id; - mem_id++; - } - return res; -} - // The height and width of input and output must be equal. void MaceRunFunc(const int in_out_size) { std::vector input_names; @@ -282,7 +32,6 @@ void MaceRunFunc(const int in_out_size) { output_names.push_back(MakeString("output", i)); } std::string filter_tensor_name = "filter"; - std::string filter_tensor_img_name = filter_tensor_name + "_image"; const DeviceType device = DeviceType::GPU; @@ -292,10 +41,6 @@ void MaceRunFunc(const int in_out_size) { std::shared_ptr net_def(new NetDef()); - // Add memory optimization - auto mem_map = AddMemoryOptimization(input_names, output_names, - input_shapes, output_shapes, - net_def.get()); std::vector data; ops::test::GenerateRandomRealTypeData(filter_shape, &data); @@ -303,35 +48,21 @@ void MaceRunFunc(const int in_out_size) { filter_tensor_name, filter_shape, 0, data.size(), net_def.get()); for (size_t i = 0; i < input_names.size(); ++i) { - std::string input_name = MakeString("mace_input_node_", - input_names[i]); - BufferToImage(input_name, input_names[i], - mace::ops::IN_OUT_CHANNEL, - {mem_map[input_names[i]]}, - device, - net_def.get()); InputInfo *info = net_def->add_input_info(); info->set_name(input_names[i]); - } - BufferToImage(filter_tensor_name, filter_tensor_img_name, - mace::ops::CONV2D_FILTER, {}, device, - net_def.get(), NetMode::INIT); - for (size_t i = 0; i < output_names.size(); ++i) { - Conv3x3(input_names[i], filter_tensor_img_name, - output_names[i], {mem_map[output_names[i]]}, - device, - net_def.get()); + for (auto d : input_shapes[0]) { + info->add_dims(static_cast(d)); + } } for (size_t i = 0; i < output_names.size(); ++i) { - std::string output_name = MakeString("mace_output_node_", - output_names[i]); - ImageToBuffer(output_names[i], output_name, - mace::ops::IN_OUT_CHANNEL, - device, - net_def.get()); OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + for (size_t i = 0; i < output_names.size(); ++i) { + Conv3x3(input_names[i], filter_tensor_name, + output_names[i], output_shapes[0], + net_def.get()); + } MaceEngineConfig config(DeviceType::GPU); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 127e58492f4f2178744267f1d61df9edc0345e8f..48011ace3e9bf50a1fff270a206e57b9b698f044 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -12,12 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
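The obfuscate_name() rewrite in mace/python/tools/model_saver.py above keeps exactly the tensor names declared as the model's input and output nodes and renames everything else, replacing the old check for the "mace_input_node" / "mace_output_node" prefixes. A toy illustration of that rule follows; the hashing helper is a stand-in, not MACE's generate_tensor_map().

import hashlib


def obfuscate(tensor_names, input_nodes, output_nodes):
    keep = set(input_nodes) | set(output_nodes)
    mapping = {}
    for name in tensor_names:
        if name in keep:
            mapping[name] = name  # model I/O keeps its public name
        else:
            # opaque, deterministic replacement for internal tensors
            mapping[name] = "t_" + hashlib.md5(name.encode()).hexdigest()[:8]
    return mapping


print(obfuscate(["input", "conv1/weights", "conv1/output", "output"],
                input_nodes=["input"], output_nodes=["output"]))
# only conv1/weights and conv1/output are renamed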
- -#include - -#include "mace/ops/conv_pool_2d_util.h" -#include "mace/ops/ops_test_util.h" -#include "mace/public/mace.h" +#include "mace/test/mace_api_test.h" namespace mace { namespace test { @@ -26,258 +21,11 @@ class MaceAPITest : public ::testing::Test {}; namespace { -void GenerateInputs(const std::vector &input_names, - const std::vector &input_shape, - std::map *inputs) { - size_t input_size = input_names.size(); - for (size_t i = 0; i < input_size; ++i) { - // Allocate input and output - int64_t input_size = - std::accumulate(input_shape.begin(), input_shape.end(), 1, - std::multiplies()); - auto buffer_in = std::shared_ptr(new float[input_size], - std::default_delete()); - // load input - std::vector input_data; - ops::test::GenerateRandomRealTypeData(input_shape, &input_data); - memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float)); - (*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in); - } -} - -void GenerateOutputs(const std::vector &output_names, - const std::vector &output_shape, - std::map *outputs) { - size_t output_size = output_names.size(); - for (size_t i = 0; i < output_size; ++i) { - int64_t output_size = - std::accumulate(output_shape.begin(), output_shape.end(), 1, - std::multiplies()); - auto buffer_out = std::shared_ptr(new float[output_size], - std::default_delete()); - (*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out); - } -} - -template -void BufferToImage(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def, - const int mode = NetMode::NORMAL) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferTransform", "BufferTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .AddIntArg("mode", mode) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void ImageToBuffer(const std::string &input_name, - const std::string &output_name, - const int buffer_type, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - - ops::test::OpDefBuilder("BufferInverseTransform", "BufferInverseTransformOp") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", buffer_type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void Conv3x3(const std::string &input_name, - const std::string &filter_name, - const std::string &output_name, - const std::vector &mem_ids, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Conv2D", "Conv2dOp") - .Input(input_name) - .Input(filter_name) - .Output(output_name) - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - operator_def.mutable_mem_id()->Reserve(mem_ids.size()); - for (auto mem_id : mem_ids) { - operator_def.add_mem_id(mem_id); - } - net_def->add_op()->CopyFrom(operator_def); -} - -template -void Relu(const std::string 
&input_name, - const std::string &output_name, - const DeviceType device_type, - NetDef *net_def) { - OperatorDef operator_def; - ops::test::OpDefBuilder("Activation", "ReluTest") - .Input(input_name) - .Output(output_name) - .AddStringArg("activation", "RELU") - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .AddIntArg("device", static_cast(device_type)) - .Finalize(&operator_def); - - net_def->add_op()->CopyFrom(operator_def); -} - -template -void AddTensor(const std::string &name, - const std::vector &shape, - const int offset, - const int data_size, - NetDef *net_def) { - ConstTensor *tensor_ptr = net_def->add_tensors(); - tensor_ptr->set_name(name); - tensor_ptr->mutable_dims()->Reserve(shape.size()); - for (auto dim : shape) { - tensor_ptr->add_dims(dim); - } - tensor_ptr->set_offset(offset); - tensor_ptr->set_data_size(data_size); - tensor_ptr->set_data_type(DataTypeToEnum::value); -} - -template -void CheckOutputs(const NetDef &net_def, - const std::map &inputs, - const std::map &outputs, - const std::vector &tensor_data) { - ops::test::OpsTestNet net; - for (auto input : inputs) { - auto input_shape = input.second.shape(); - const int64_t data_size = std::accumulate(input_shape.begin(), - input_shape.end(), 1, - std::multiplies()); - std::vector input_data(data_size); - memcpy(input_data.data(), input.second.data().get(), - data_size * sizeof(float)); - std::string input_name = MakeString("mace_input_node_", - input.first); - net.AddInputFromArray(input_name, input.second.shape(), - input_data); - } - auto tensors = net_def.tensors(); - for (auto tensor : tensors) { - std::vector shape = {tensor.dims().begin(), tensor.dims().end()}; - const int64_t data_size = std::accumulate(shape.begin(), - shape.end(), 1, - std::multiplies()); - std::vector data(data_size); - memcpy(data.data(), - reinterpret_cast(tensor_data.data()) + tensor.offset(), - tensor.data_size() * sizeof(T)); - net.AddInputFromArray(tensor.name(), shape, data); - } - net.RunNet(net_def, D); - - std::unique_ptr allocator(new CPUAllocator); - for (auto output : outputs) { - std::unique_ptr tmp_tensor( - new Tensor(allocator.get(), - DataTypeToEnum::v())); - auto output_shape = output.second.shape(); - const int64_t data_size = std::accumulate(output_shape.begin(), - output_shape.end(), 1, - std::multiplies()); - tmp_tensor->Resize(output.second.shape()); - float *data = tmp_tensor->mutable_data(); - memcpy(data, output.second.data().get(), data_size * sizeof(float)); - std::string output_name = MakeString("mace_output_node_", - output.first); - ops::test::ExpectTensorNear(*tmp_tensor, - *net.GetOutput(output_name.data()), - 1e-5); - } -} - -std::map AddMemoryOptimization( - const std::vector &input_names, - const std::vector &output_names, - const std::vector> &input_shapes, - const std::vector> &output_shapes, - NetDef *net_def) { - std::map res; - int mem_id = 0; - size_t input_shape_size = input_shapes.size(); - uint32_t in_mem_block_x = 0; - uint32_t in_mem_block_y = 0; - for (size_t i = 0; i < input_shape_size; ++i) { - in_mem_block_x = std::max(in_mem_block_x, - input_shapes[i][2] * - RoundUpDiv4(input_shapes[i][3])); - in_mem_block_y = std::max(in_mem_block_y, - input_shapes[i][0] * - input_shapes[i][1]); - } - size_t input_size = input_names.size(); - size_t output_size = output_names.size(); - MemoryArena *mem_arena_ptr = net_def->mutable_mem_arena(); - mem_arena_ptr->mutable_mem_block()->Reserve(input_size + output_size); - for (size_t i = 0; i < input_size; ++i) { - MemoryBlock *mem_blk_ptr = 
mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(in_mem_block_x); - mem_blk_ptr->set_y(in_mem_block_y); - res[input_names[i]] = mem_id; - mem_id++; - } - size_t output_shape_size = output_shapes.size(); - uint32_t out_mem_block_x = 0; - uint32_t out_mem_block_y = 0; - for (size_t i = 0; i < output_shape_size; ++i) { - out_mem_block_x = std::max(out_mem_block_x, - output_shapes[i][2] * - RoundUpDiv4(output_shapes[i][3])); - out_mem_block_y = std::max(out_mem_block_y, - output_shapes[i][0] * - output_shapes[i][1]); - } - for (size_t i = 0; i < output_size; ++i) { - MemoryBlock *mem_blk_ptr = mem_arena_ptr->add_mem_block(); - mem_blk_ptr->set_mem_id(mem_id); - mem_blk_ptr->set_device_type(DeviceType::GPU); - mem_blk_ptr->set_mem_type(MemoryType::GPU_IMAGE); - mem_blk_ptr->set_x(out_mem_block_x); - mem_blk_ptr->set_y(out_mem_block_y); - res[output_names[i]] = mem_id; - mem_id++; - } - return res; -} // The height and width of input and output must be equal. -template +template void MaceRun(const int in_out_size, + const std::vector &max_shape, const std::vector> &input_shapes, const std::vector> &output_shapes, const std::vector &filter_shape) { @@ -288,52 +36,31 @@ void MaceRun(const int in_out_size, output_names.push_back(MakeString("output", i)); } std::string filter_tensor_name = "filter"; - std::string filter_tensor_img_name = filter_tensor_name + "_image"; - - const DeviceType device = DeviceType::GPU; std::shared_ptr net_def(new NetDef()); - // Add memory optimization - auto mem_map = AddMemoryOptimization(input_names, output_names, - input_shapes, output_shapes, - net_def.get()); - std::vector data; ops::test::GenerateRandomRealTypeData(filter_shape, &data); AddTensor(filter_tensor_name, filter_shape, 0, data.size(), net_def.get()); for (size_t i = 0; i < input_names.size(); ++i) { - std::string input_name = MakeString("mace_input_node_", - input_names[i]); - BufferToImage(input_name, input_names[i], - mace::ops::IN_OUT_CHANNEL, - {mem_map[input_names[i]]}, - device, - net_def.get()); InputInfo *info = net_def->add_input_info(); info->set_name(input_names[i]); - } - BufferToImage(filter_tensor_name, filter_tensor_img_name, - mace::ops::CONV2D_FILTER, {}, device, - net_def.get(), NetMode::INIT); - for (size_t i = 0; i < output_names.size(); ++i) { - Conv3x3(input_names[i], filter_tensor_img_name, - output_names[i], {mem_map[output_names[i]]}, - device, net_def.get()); + for (auto d : max_shape) { + info->add_dims(static_cast(d)); + } } for (size_t i = 0; i < output_names.size(); ++i) { - std::string output_name = MakeString("mace_output_node_", - output_names[i]); - ImageToBuffer(output_names[i], output_name, - mace::ops::IN_OUT_CHANNEL, - device, - net_def.get()); OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + for (size_t i = 0; i < output_names.size(); ++i) { + Conv3x3(input_names[i], filter_tensor_name, + output_names[i], max_shape, + net_def.get()); + } - MaceEngineConfig config(DeviceType::GPU); + MaceEngineConfig config(D); MaceEngine engine(config); MaceStatus status = engine.Init(net_def.get(), input_names, output_names, @@ -354,36 +81,64 @@ void MaceRun(const int in_out_size, } } - CheckOutputs(*net_def, inputs, outputs, data); + CheckOutputs(*net_def, inputs, outputs, data); } } // namespace -TEST_F(MaceAPITest, GPUSingleInputOutput) { - MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {16, 16, 3, 3}); 
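The AddMemoryOptimization() helpers deleted from both tests above sized the shared OpenCL images from the NHWC shapes they had to hold: the image width is W * ceil(C / 4), since four channels are packed into one RGBA texel, and the height is N * H, each taken as the maximum over the inputs (or outputs) sharing the block. A standalone restatement of that arithmetic, with an illustrative function name:

def image_block_for(nhwc_shapes):
    """(width, height) of one shared GPU image able to hold every
    NHWC tensor in nhwc_shapes, packing 4 channels per RGBA texel."""
    width = height = 0
    for n, h, w, c in nhwc_shapes:
        width = max(width, w * ((c + 3) // 4))
        height = max(height, n * h)
    return width, height


print(image_block_for([(1, 16, 32, 16), (1, 32, 64, 16)]))  # (256, 32)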
- MaceRun(1, {{1, 32, 32, 16}}, {{1, 32, 32, 16}}, {16, 16, 3, 3}); +TEST_F(MaceAPITest, SingleInputOutput) { + MaceRun(1, + {1, 32, 32, 16}, + {{1, 32, 32, 16}}, + {{1, 32, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(1, + {1, 32, 32, 16}, + {{1, 32, 32, 16}}, + {{1, 32, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(1, + {1, 32, 32, 16}, + {{1, 32, 32, 16}}, + {{1, 32, 32, 16}}, + {16, 16, 3, 3}); } -TEST_F(MaceAPITest, GPUMultipleInputOutput) { - MaceRun(2, - {{1, 16, 32, 16}}, - {{1, 16, 32, 16}}, - {16, 16, 3, 3}); - MaceRun(2, - {{1, 16, 32, 16}}, - {{1, 16, 32, 16}}, - {16, 16, 3, 3}); +TEST_F(MaceAPITest, MultipleInputOutput) { + MaceRun(2, + {1, 16, 32, 16}, + {{1, 16, 32, 16}}, + {{1, 16, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(2, + {1, 16, 32, 16}, + {{1, 16, 32, 16}}, + {{1, 16, 32, 16}}, + {16, 16, 3, 3}); + MaceRun(2, + {1, 16, 32, 16}, + {{1, 16, 32, 16}}, + {{1, 16, 32, 16}}, + {16, 16, 3, 3}); } -TEST_F(MaceAPITest, GPUVariableInputShape) { - MaceRun(1, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {16, 16, 3, 3}); - MaceRun(2, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {{1, 16, 32, 16}, {1, 32, 64, 16}}, - {16, 16, 3, 3}); +TEST_F(MaceAPITest, VariableInputShape) { + // TODO(liyin): there is a bug of cpu convolution +// MaceRun(1, +// {1, 32, 64, 16}, +// {{1, 16, 32, 16}, {1, 32, 64, 16}}, +// {{1, 16, 32, 16}, {1, 32, 64, 16}}, +// {16, 16, 3, 3}); + MaceRun(1, + {1, 32, 64, 16}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {16, 16, 3, 3}); + MaceRun(2, + {1, 32, 64, 16}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {{1, 16, 32, 16}, {1, 32, 64, 16}}, + {16, 16, 3, 3}); } } // namespace test diff --git a/mace/test/mace_api_test.h b/mace/test/mace_api_test.h new file mode 100644 index 0000000000000000000000000000000000000000..f43815ccc0d39596229108c62d6082e120ed71ae --- /dev/null +++ b/mace/test/mace_api_test.h @@ -0,0 +1,187 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
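The rewritten tests above now cover DeviceType::CPU as well, and the shared header that follows converts the NHWC test data to NCHW on the way in (and back to NHWC for the outputs) through TransformDataFormat. The numpy snippet below is only a conceptual illustration of that transpose, not MACE's implementation.

import numpy as np


def nhwc_to_nchw(x):
    return np.transpose(x, (0, 3, 1, 2))


def nchw_to_nhwc(x):
    return np.transpose(x, (0, 2, 3, 1))


x = np.random.rand(1, 32, 32, 16).astype(np.float32)  # N, H, W, C
print(nhwc_to_nchw(x).shape)                           # (1, 16, 32, 32)
assert nchw_to_nhwc(nhwc_to_nchw(x)).shape == x.shape

The TransposeShape helper added to mace/utils/utils.h below applies the same kind of permutation to shape vectors.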
+ +#ifndef MACE_TEST_MACE_API_TEST_H_ +#define MACE_TEST_MACE_API_TEST_H_ + +#include +#include +#include +#include +#include + +#include "mace/ops/conv_pool_2d_util.h" +#include "mace/ops/ops_test_util.h" +#include "mace/public/mace.h" + +namespace mace { +namespace test { + +inline void GenerateInputs(const std::vector &input_names, + const std::vector &input_shape, + std::map *inputs) { + size_t input_size = input_names.size(); + for (size_t i = 0; i < input_size; ++i) { + // Allocate input and output + int64_t input_size = + std::accumulate(input_shape.begin(), input_shape.end(), 1, + std::multiplies()); + auto buffer_in = std::shared_ptr(new float[input_size], + std::default_delete()); + // load input + std::vector input_data; + ops::test::GenerateRandomRealTypeData(input_shape, &input_data); + memcpy(buffer_in.get(), input_data.data(), input_size * sizeof(float)); + (*inputs)[input_names[i]] = mace::MaceTensor(input_shape, buffer_in); + } +} + +inline void GenerateOutputs(const std::vector &output_names, + const std::vector &output_shape, + std::map *outputs) { + size_t output_size = output_names.size(); + for (size_t i = 0; i < output_size; ++i) { + int64_t output_size = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + auto buffer_out = std::shared_ptr(new float[output_size], + std::default_delete()); + (*outputs)[output_names[i]] = mace::MaceTensor(output_shape, buffer_out); + } +} + +template +void Conv3x3(const std::string &input_name, + const std::string &filter_name, + const std::string &output_name, + const std::vector &output_shape, + NetDef *net_def) { + OperatorDef operator_def; + ops::test::OpDefBuilder("Conv2D", "Conv2dOp") + .Input(input_name) + .Input(filter_name) + .Output(output_name) + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(&operator_def); + + OutputShape *shape = operator_def.add_output_shape(); + for (auto dim : output_shape) { + shape->add_dims(dim); + } + + net_def->add_op()->CopyFrom(operator_def); +} + +template +void Relu(const std::string &input_name, + const std::string &output_name, + const DeviceType device_type, + NetDef *net_def) { + OperatorDef operator_def; + ops::test::OpDefBuilder("Activation", "ReluTest") + .Input(input_name) + .Output(output_name) + .AddStringArg("activation", "RELU") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("device", static_cast(device_type)) + .Finalize(&operator_def); + + net_def->add_op()->CopyFrom(operator_def); +} + +template +void AddTensor(const std::string &name, + const std::vector &shape, + const int offset, + const int data_size, + NetDef *net_def) { + ConstTensor *tensor_ptr = net_def->add_tensors(); + tensor_ptr->set_name(name); + tensor_ptr->mutable_dims()->Reserve(shape.size()); + for (auto dim : shape) { + tensor_ptr->add_dims(dim); + } + tensor_ptr->set_offset(offset); + tensor_ptr->set_data_size(data_size); + tensor_ptr->set_data_type(DataTypeToEnum::value); +} + +template +void CheckOutputs(const NetDef &net_def, + const std::map &inputs, + const std::map &outputs, + const std::vector &tensor_data) { + ops::test::OpsTestNet net; + for (auto input : inputs) { + auto input_shape = input.second.shape(); + const int64_t data_size = std::accumulate(input_shape.begin(), + input_shape.end(), 1, + std::multiplies()); + std::vector input_data(data_size); + memcpy(input_data.data(), input.second.data().get(), + data_size * 
sizeof(float)); + if (D == DeviceType::CPU) { + std::string input_name = input.first + "NHWC"; + net.AddInputFromArray(input_name, input_shape, input_data); + net.TransformDataFormat(input_name, NHWC, input.first, NCHW); + } else { + net.AddInputFromArray(input.first, input_shape, input_data); + } + } + auto tensors = net_def.tensors(); + for (auto tensor : tensors) { + std::vector shape = {tensor.dims().begin(), tensor.dims().end()}; + const int64_t data_size = std::accumulate(shape.begin(), + shape.end(), 1, + std::multiplies()); + std::vector data(data_size); + memcpy(data.data(), + reinterpret_cast(tensor_data.data()) + tensor.offset(), + tensor.data_size() * sizeof(T)); + net.AddInputFromArray(tensor.name(), shape, data); + } + net.RunNet(net_def, D); + + std::unique_ptr allocator(new CPUAllocator); + for (auto output : outputs) { + std::unique_ptr tmp_tensor( + new Tensor(allocator.get(), + DataTypeToEnum::v())); + auto output_shape = output.second.shape(); + const int64_t data_size = std::accumulate(output_shape.begin(), + output_shape.end(), 1, + std::multiplies()); + tmp_tensor->Resize(output.second.shape()); + float *data = tmp_tensor->mutable_data(); + memcpy(data, output.second.data().get(), data_size * sizeof(float)); + + std::string output_name = output.first; + if (D == DeviceType::CPU) { + output_name = output.first + "NHWC"; + net.TransformDataFormat(output.first, + NCHW, + output_name, + NHWC); + } + ops::test::ExpectTensorNear(*tmp_tensor, + *net.GetOutput(output_name.data()), + 1e-5); + } +} +} // namespace test +} // namespace mace +#endif // MACE_TEST_MACE_API_TEST_H_ diff --git a/mace/utils/utils.h b/mace/utils/utils.h index 237febcce69f9d849ad3431c502295273bea89b3..1d9eebc9ff5a2897bd70e5c8cac439957c4b9441 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -174,5 +174,16 @@ inline bool EnvEnabled(std::string env_name) { return !(!env || env[0] == 0 || env[0] == '0'); } +template +std::vector TransposeShape(const std::vector &shape, + const std::vector &dst_dims) { + size_t shape_dims = shape.size(); + std::vector output_shape(shape_dims); + for (size_t i = 0; i < shape_dims; ++i) { + output_shape[i] = static_cast(shape[dst_dims[i]]); + } + return output_shape; +} + } // namespace mace #endif // MACE_UTILS_UTILS_H_ diff --git a/tools/converter.py b/tools/converter.py index 6f66dafd3038e97360b0c451cdd4ce33d11e44f6..e98715fc95def1972c376c76c211758b19c6b2b2 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -163,6 +163,16 @@ DSPDataType = Enum('DSPDataType', [(ele, ele) for ele in DSPDataTypeStrs], WinogradParameters = [0, 2, 4] +DataFormatStrs = [ + "NONE", + "NHWC", +] + + +class DataFormat(object): + NONE = "NONE" + NHWC = "NHWC" + class DefaultValues(object): mace_lib_type = MACELibType.static @@ -195,6 +205,8 @@ class YAMLKeyword(object): runtime = 'runtime' data_type = 'data_type' input_data_types = 'input_data_types' + input_data_formats = 'input_data_formats' + output_data_formats = 'output_data_formats' limit_opencl_kernel_time = 'limit_opencl_kernel_time' nnlib_graph_mode = 'nnlib_graph_mode' obfuscate = 'obfuscate' @@ -487,7 +499,7 @@ def format_model_config(flags): if input_data_types: if not isinstance(input_data_types, list): subgraph[YAMLKeyword.input_data_types] = [input_data_types] - for input_data_type in input_data_types: + for input_data_type in subgraph[YAMLKeyword.input_data_types]: mace_check(input_data_type in InputDataTypeStrs, ModuleName.YAML_CONFIG, "'input_data_types' must be in " @@ -495,6 +507,49 @@ def 
format_model_config(flags): else: subgraph[YAMLKeyword.input_data_types] = [] + input_data_formats = subgraph.get(YAMLKeyword.input_data_formats, + []) + if input_data_formats: + if not isinstance(input_data_formats, list): + subgraph[YAMLKeyword.input_data_formats] =\ + [input_data_formats] + else: + mace_check(len(input_data_formats) + == len(subgraph[YAMLKeyword.input_tensors]), + ModuleName.YAML_CONFIG, + "input_data_formats should match" + " the size of input") + for input_data_format in\ + subgraph[YAMLKeyword.input_data_formats]: + mace_check(input_data_format in DataFormatStrs, + ModuleName.YAML_CONFIG, + "'input_data_formats' must be in " + + str(DataFormatStrs) + ", but got " + + input_data_formats) + else: + subgraph[YAMLKeyword.input_data_formats] = [DataFormat.NHWC] + + output_data_formats = subgraph.get(YAMLKeyword.output_data_formats, + []) + if output_data_formats: + if not isinstance(output_data_formats, list): + subgraph[YAMLKeyword.output_data_formats] = \ + [output_data_formats] + else: + mace_check(len(output_data_formats) + == len(subgraph[YAMLKeyword.output_tensors]), + ModuleName.YAML_CONFIG, + "output_data_formats should match" + " the size of output") + for output_data_format in\ + subgraph[YAMLKeyword.output_data_formats]: + mace_check(output_data_format in DataFormatStrs, + ModuleName.YAML_CONFIG, + "'input_data_formats' must be in " + + str(DataFormatStrs)) + else: + subgraph[YAMLKeyword.output_data_formats] = [DataFormat.NHWC] + validation_threshold = subgraph.get( YAMLKeyword.validation_threshold, {}) if not isinstance(validation_threshold, dict): @@ -803,7 +858,9 @@ def convert_model(configs, cl_mem_type): model_config[YAMLKeyword.model_sha256_checksum], model_config[YAMLKeyword.weight_sha256_checksum], ",".join(subgraphs[0][YAMLKeyword.input_tensors]), + ",".join(subgraphs[0][YAMLKeyword.input_data_formats]), ",".join(subgraphs[0][YAMLKeyword.output_tensors]), + ",".join(subgraphs[0][YAMLKeyword.output_data_formats]), ",".join(subgraphs[0][YAMLKeyword.check_tensors]), runtime, model_name, diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 0eb991296d180225b25bc2cc1429f0de17c10e76..601f5b2cbe45b4898f0683dcd93095dfca333bc1 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -555,7 +555,9 @@ def gen_model_code(model_codegen_dir, model_sha256_checksum, weight_sha256_checksum, input_nodes, + input_data_formats, output_nodes, + output_data_formats, check_nodes, runtime, model_tag, @@ -588,7 +590,9 @@ def gen_model_code(model_codegen_dir, "--model_checksum=%s" % model_sha256_checksum, "--weight_checksum=%s" % weight_sha256_checksum, "--input_node=%s" % input_nodes, + "--input_data_formats=%s" % input_data_formats, "--output_node=%s" % output_nodes, + "--output_data_formats=%s" % output_data_formats, "--check_node=%s" % check_nodes, "--runtime=%s" % runtime, "--template=%s" % "mace/python/tools",
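The tools/converter.py additions above normalize and validate the new input_data_formats / output_data_formats fields before they are passed on to gen_model_code: an unset field defaults to NHWC, a single value stands for all tensors, and a list must match the tensor count and contain only NONE or NHWC. A self-contained sketch of that normalization, mirroring the diff's intent rather than its exact code:

DATA_FORMATS = ["NONE", "NHWC"]


def normalize_formats(formats, tensors, default="NHWC"):
    if not formats:
        return [default]            # unset: NHWC is assumed for every tensor
    if not isinstance(formats, list):
        formats = [formats]         # a single value applies to all tensors
    elif len(formats) != len(tensors):
        raise ValueError("data formats should match the number of tensors")
    for fmt in formats:
        if fmt not in DATA_FORMATS:
            raise ValueError("data format must be one of %s" % DATA_FORMATS)
    return formats


print(normalize_formats("NHWC", ["input0", "input1"]))   # ['NHWC']
print(normalize_formats(["NONE", "NHWC"], ["a", "b"]))   # ['NONE', 'NHWC']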