From 02c1048ea3c5007def340cb8415522fbc566c247 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?=
Date: Sat, 29 Dec 2018 11:41:33 +0800
Subject: [PATCH] Revert "Merge branch 'shuffle' into 'master'"

This reverts merge request !931
---
 mace/core/net.cc            | 96 ++-----------------------------------
 mace/core/net.h             |  2 -
 mace/core/workspace.cc      | 44 ++++-------------
 mace/libmace/mace.cc        |  8 ++--
 mace/ops/channel_shuffle.cc | 10 ++--
 5 files changed, 23 insertions(+), 137 deletions(-)

diff --git a/mace/core/net.cc b/mace/core/net.cc
index 2aeb951e..7912a6d4 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -14,7 +14,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -64,85 +63,18 @@ bool TransformRequiredOp(const std::string &op_type) {
 }
 #endif  // MACE_ENABLE_OPENCL
 
-
-// TODO(lichao): Move to runtime driver class after universality done.
-// fallback to gpu buffer when kernels are implemented
-void FindAvailableDevicesForOp(const OpRegistryBase &op_registry,
-                               const OperatorDef &op,
-                               const std::unordered_map<std::string,
-                                   std::vector<index_t>> &tensor_shape_info,
-                               std::set<DeviceType> *available_devices) {
-  auto devices = op_registry.AvailableDevices(op.type());
-  available_devices->insert(devices.begin(), devices.end());
-  std::string op_type = op.type();
-  // For those whose shape is not 4-rank but can run on GPU
-  if (op_type == "BufferTransform"
-      || op_type == "LSTMCell"
-      || op_type == "FullyConnected"
-      || op_type == "Softmax"
-      || op_type == "Squeeze") {
-    return;
-  } else {
-    if (op.output_shape_size() != op.output_size()) {
-      return;
-    }
-    if (op.output_shape(0).dims_size() != 4) {
-      available_devices->erase(DeviceType::GPU);
-    }
-
-    if (op_type == "Split") {
-      if (op.output_shape(0).dims_size() != 4
-          || op.output_shape(0).dims()[3] % 4 != 0) {
-        available_devices->erase(DeviceType::GPU);
-      }
-    } else if (op_type == "Concat") {
-      if (op.output_shape(0).dims_size() != 4) {
-        available_devices->erase(DeviceType::GPU);
-      } else {
-        if (op.input_size() != 2) {
-          for (const std::string &input : op.input()) {
-            if (tensor_shape_info.find(input) != tensor_shape_info.end()) {
-              auto &input_shape = tensor_shape_info.at(input);
-              if (input_shape[3] % 4 != 0) {
-                available_devices->erase(DeviceType::GPU);
-                break;
-              }
-            }
-          }
-        }
-      }
-    } else if (op_type == "ChannelShuffle") {
-      int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-          op, "group", 1);
-      int channels = op.output_shape(0).dims(3);
-      int channels_per_group = channels / groups;
-      if (groups % 4 != 0 || channels_per_group % 4 != 0) {
-        available_devices->erase(DeviceType::GPU);
-      }
-    }
-  }
-}
-
 }  // namespace
 
 std::unique_ptr<Operation> SerialNet::CreateOperation(
     const OpRegistryBase *op_registry,
     OpConstructContext *construct_context,
     std::shared_ptr<OperatorDef> op_def,
-    const std::unordered_map<std::string,
-        std::vector<index_t>> tensor_shape_info,
     DataFormat data_format_flag,
     bool is_quantize_model) {
   // Create the Operation
   DeviceType target_device_type = target_device_->device_type();
   // Get available devices
-  std::set<DeviceType> available_devices;
-  FindAvailableDevicesForOp(*op_registry,
-                            *op_def,
-                            tensor_shape_info,
-                            &available_devices);
+  auto available_devices = op_registry->AvailableDevices(op_def->type());
   // Find the device type to run the op.
   // If the target_device_type in available devices, use target_device_type,
   // otherwise, fallback to CPU device.
@@ -161,7 +93,6 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
     }
   }
   op_def->set_device_type(device_type);
-
   // transpose output shape if run on CPU (default format is NHWC)
   if (!is_quantize_model && device_type == DeviceType::CPU &&
       op_def->output_shape_size() == op_def->output_size()) {
@@ -208,7 +139,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   MemoryType target_mem_type;
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
-
+  //
   DataFormat data_format_flag = NHWC;
   if (target_device_->device_type() == DeviceType::CPU) {
     target_mem_type = MemoryType::CPU_BUFFER;
@@ -232,7 +163,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
             target_mem_type, DataType::DT_FLOAT, input_shape, -1));
       }
     }
-
 #ifdef MACE_ENABLE_OPENCL
   else {  // GPU NOLINT[readability/braces]
     target_mem_type = MemoryType::GPU_BUFFER;
@@ -246,22 +176,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   }
 #endif  // MACE_ENABLE_OPENCL
 
-  std::unordered_map<std::string, std::vector<index_t>> tensor_shape_info;
-  for (auto &op : net_def->op()) {
-    if (op.output_size() != op.output_shape_size()) {
-      continue;
-    }
-    for (int i = 0; i < op.output_size(); ++i) {
-      tensor_shape_info[op.output(i)] =
-          std::move(std::vector<index_t>(op.output_shape(i).dims().begin(),
-                                         op.output_shape(i).dims().end()));
-    }
-  }
-  for (auto &tensor : net_def->tensors()) {
-    tensor_shape_info[tensor.name()] =
-        std::move(std::vector<index_t>(tensor.dims().begin(),
-                                       tensor.dims().end()));
-  }
   OpConstructContext construct_context(ws_);
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
@@ -269,7 +183,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
     auto op = CreateOperation(op_registry,
                               &construct_context,
                               op_def,
-                              tensor_shape_info,
                               data_format_flag,
                               is_quantize_model);
 #ifdef MACE_ENABLE_OPENCL
@@ -298,8 +211,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
         auto &output_info = output_map.at(op_def->input(i));
         // check whether the tensor has been transformed
         if (transformed_set.count(t_input_name) == 0) {
-          VLOG(1) << "Add Transform operation " << op_def->name()
-                  << " to transform tensor "
+          VLOG(1) << "Add Transform operation to transform tensor '"
                   << op_def->input(i) << "', from memory type "
                   << output_info.mem_type << " to "
                   << wanted_in_mem_type
@@ -322,7 +234,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
               op_registry,
               &construct_context,
               transform_op_def,
-              tensor_shape_info,
              data_format_flag);
           operators_.emplace_back(std::move(transform_op));
           transformed_set.insert(t_input_name);
@@ -410,7 +321,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           op_registry,
           &construct_context,
           transform_op_def,
-          tensor_shape_info,
           output_data_format);
       operators_.emplace_back(std::move(transform_op));
       // where to do graph reference count.
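For context: the net.cc hunks above replace the shape-aware FindAvailableDevicesForOp() heuristic with a plain registry lookup, so op placement reverts to "run on the target device if a kernel is registered for the op, otherwise fall back to CPU". Below is a minimal standalone sketch of that restored policy; the registry function and enum here are simplified, hypothetical stand-ins for MACE's OpRegistryBase and DeviceType, not the real API.

```cpp
#include <iostream>
#include <set>
#include <string>

enum class DeviceType { CPU, GPU };

// Hypothetical registry: every op has a CPU kernel; only some have GPU ones.
std::set<DeviceType> AvailableDevices(const std::string &op_type) {
  if (op_type == "Conv2D" || op_type == "Softmax") {
    return {DeviceType::CPU, DeviceType::GPU};
  }
  return {DeviceType::CPU};
}

// The restored policy: prefer the target device, else fall back to CPU.
DeviceType SelectDevice(const std::string &op_type, DeviceType target) {
  const std::set<DeviceType> available = AvailableDevices(op_type);
  return available.count(target) != 0 ? target : DeviceType::CPU;
}

int main() {
  // "Conv2D" stays on GPU; "TopK" (no GPU kernel here) falls back to CPU.
  std::cout << (SelectDevice("Conv2D", DeviceType::GPU) == DeviceType::GPU)
            << "\n"
            << (SelectDevice("TopK", DeviceType::GPU) == DeviceType::CPU)
            << "\n";
  return 0;
}
```

The pre-revert code refined this per-op with output-shape checks (4-rank, channel alignment); the revert trades that granularity for the simpler registry-only lookup.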
diff --git a/mace/core/net.h b/mace/core/net.h
index 5362d9ee..10577a57 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -59,8 +59,6 @@ class SerialNet : public NetBase {
       const OpRegistryBase *op_registry,
       OpConstructContext *construct_context,
       std::shared_ptr<OperatorDef> op_def,
-      const std::unordered_map<std::string,
-                               std::vector<index_t>> tensor_shape_info,
       DataFormat input_format,
       bool is_quantize_model = false);
 
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 0d2e58dd..43950a9d 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -328,14 +328,6 @@ void Workspace::RemoveUnusedBuffer() {
 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
                                       const unsigned char *model_data,
                                       Allocator *alloc) {
-  std::unordered_set<std::string> tensor_to_host;
-  for (auto &op : net_def.op()) {
-    if (op.device_type() == DeviceType::CPU) {
-      for (std::string input : op.input()) {
-        tensor_to_host.insert(input);
-      }
-    }
-  }
   for (auto &const_tensor : net_def.tensors()) {
     auto iter = tensor_map_.find(const_tensor.name());
     if (iter->second->unused()) {
@@ -346,32 +338,16 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
       for (const index_t d : const_tensor.dims()) {
        dims.push_back(d);
       }
-
-      if (tensor_to_host.find(const_tensor.name()) != tensor_to_host.end()) {
-        DataType host_data_type = const_tensor.data_type();
-        if (host_data_type == DataType::DT_HALF) {
-          host_data_type = DataType::DT_FLOAT;
-        }
-        std::unique_ptr<Tensor> tensor(
-            new Tensor(alloc, host_data_type, true, const_tensor.name()));
-        tensor->Resize(dims);
-        MACE_CHECK(tensor->size() == const_tensor.data_size(),
-                   "Tensor's data_size not equal with the shape");
-        if (const_tensor.data_type() == DataType::DT_HALF) {
-          Tensor::MappingGuard guard(tensor.get());
-          float *dst_data = tensor->mutable_data<float>();
-          const half *org_data = reinterpret_cast<const half *>(
-              model_data + const_tensor.offset());
-          for (index_t i = 0; i < const_tensor.data_size(); ++i) {
-            dst_data[i] = half_float::half_cast<float>(org_data[i]);
-          }
-        } else {
-          tensor->CopyBytes(model_data + const_tensor.offset(),
-                            const_tensor.data_size() *
-                                GetEnumTypeSize(const_tensor.data_type()));
-        }
-        tensor_map_[const_tensor.name()] = std::move(tensor);
-      }
+      std::unique_ptr<Tensor> tensor(
+          new Tensor(alloc, const_tensor.data_type()));
+      tensor->Resize(dims);
+      MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                 "Tensor's data_size not equal with the shape");
+      tensor->CopyBytes(model_data + const_tensor.offset(),
+                        const_tensor.data_size() *
+                            GetEnumTypeSize(const_tensor.data_type()));
+
+      tensor_map_[const_tensor.name()] = std::move(tensor);
     }
   }
   tensor_buffer_.reset(nullptr);
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 047cdf8e..42959ca8 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -482,14 +482,14 @@ MaceStatus MaceEngine::Impl::Init(
   MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
                                                     &mem_optimizer,
                                                     device_.get()));
-  if (device_type_ == DeviceType::GPU) {
-    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
-  }
+
   MACE_RETURN_IF_ERROR(net_->Init());
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
-
+  if (device_type_ == DeviceType::GPU) {
+    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
+  }
   return MaceStatus::MACE_SUCCESS;
 }
 
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index 8301ccb5..d4404c61 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -58,12 +58,14 @@ class ChannelShuffleOp : public Operation {
 #pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t b = 0; b < batch; ++b) {
       for (index_t c = 0; c < channels; ++c) {
+        const T *input_base = input_ptr + b * batch_size;
+        T *output_base = output_ptr + b * batch_size;
         index_t g = c % groups_;
         index_t idx = c / groups_;
-        const T *in_ptr = input_ptr + b * batch_size
-            + (g * channels_per_group + idx) * image_size;
-        T *out_ptr = output_ptr + b * batch_size + c * image_size;
-        memcpy(out_ptr, in_ptr, image_size * sizeof(float));
+        for (index_t hw = 0; hw < height * width; ++hw) {
+          output_base[c * image_size + hw] = input_base[
+              (g * channels_per_group + idx) * image_size + hw];
+        }
       }
     }
 
-- 
GitLab
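For context: the channel_shuffle.cc hunk reverts a memcpy-based fast path back to a per-element gather, where output channel c reads input channel (c % groups) * channels_per_group + c / groups. Below is a minimal single-threaded sketch of that mapping on plain float/NCHW data, without the templating and OpenMP pragmas of the real op; function and parameter names are illustrative, not MACE's.

```cpp
#include <cassert>
#include <vector>

// Gather-style channel shuffle (NCHW): output channel c copies input channel
// (c % groups) * channels_per_group + c / groups, one element at a time.
void ChannelShuffle(const float *input, float *output,
                    int batch, int channels, int image_size, int groups) {
  assert(channels % groups == 0);
  const int channels_per_group = channels / groups;
  const int batch_size = channels * image_size;
  for (int b = 0; b < batch; ++b) {
    const float *input_base = input + b * batch_size;
    float *output_base = output + b * batch_size;
    for (int c = 0; c < channels; ++c) {
      const int src_c = (c % groups) * channels_per_group + c / groups;
      for (int hw = 0; hw < image_size; ++hw) {
        output_base[c * image_size + hw] =
            input_base[src_c * image_size + hw];
      }
    }
  }
}

int main() {
  // 1 batch, 4 channels, 1 pixel, 2 groups: [c0 c1 c2 c3] -> [c0 c2 c1 c3].
  std::vector<float> in = {0, 1, 2, 3}, out(4);
  ChannelShuffle(in.data(), out.data(), 1, 4, 1, 2);
  assert(out[0] == 0 && out[1] == 2 && out[2] == 1 && out[3] == 3);
  return 0;
}
```

Note the pointer setup sits inside the inner loop in the patch itself: with `collapse(2)`, OpenMP requires nothing between the two collapsed loops, so the b-dependent base pointers cannot be hoisted there.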
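The workspace.cc hunk likewise drops the path that promoted DT_HALF constant tensors to float for CPU consumers; after this revert, constants are reloaded with their stored data type unconditionally. The conversion being removed is a per-element widening cast, sketched below under the assumption that the single-header half library MACE vendors (half.hpp, providing half_float::half and half_cast) is on the include path.

```cpp
#include <half.hpp>  // half_float::half, half_float::half_cast

#include <cstddef>
#include <vector>

// Widen a buffer of fp16 weights to fp32 one element at a time, as the
// removed workspace.cc loop did for constants feeding CPU ops.
std::vector<float> HalfToFloat(const half_float::half *src, std::size_t n) {
  std::vector<float> dst(n);
  for (std::size_t i = 0; i < n; ++i) {
    dst[i] = half_float::half_cast<float>(src[i]);
  }
  return dst;
}

int main() {
  // 0.5 and -1.25 are exactly representable in fp16, so the round trip
  // through half and back to float is lossless here.
  std::vector<half_float::half> weights = {
      half_float::half(0.5f), half_float::half(-1.25f)};
  std::vector<float> promoted = HalfToFloat(weights.data(), weights.size());
  return (promoted[0] == 0.5f && promoted[1] == -1.25f) ? 0 : 1;
}
```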