diff --git a/mace/core/net.cc b/mace/core/net.cc
index 7912a6d4209808c25b7b33b47806f3eedf81112b..2aeb951e92fe8a44cb814caf6ab13eaf5c6bae7c 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -14,6 +14,7 @@
 #include 
 #include 
+#include <set>
 #include 
 #include 
 
@@ -63,18 +64,85 @@ bool TransformRequiredOp(const std::string &op_type) {
 }
 #endif  // MACE_ENABLE_OPENCL
 
+
+
+// TODO(lichao): Move to runtime driver class after universality done.
+// fallback to gpu buffer when kernels are implemented
+void FindAvailableDevicesForOp(const OpRegistryBase &op_registry,
+                               const OperatorDef &op,
+                               const std::unordered_map<std::string,
+                                   std::vector<index_t>> &tensor_shape_info,
+                               std::set<DeviceType>
+                                   *available_devices) {
+  auto devices = op_registry.AvailableDevices(op.type());
+  available_devices->insert(devices.begin(), devices.end());
+  std::string op_type = op.type();
+  // For those whose shape is not 4-rank but can run on GPU
+  if (op_type == "BufferTransform"
+      || op_type == "LSTMCell"
+      || op_type == "FullyConnected"
+      || op_type == "Softmax"
+      || op_type == "Squeeze") {
+    return;
+  } else {
+    if (op.output_shape_size() != op.output_size()) {
+      return;
+    }
+    if (op.output_shape(0).dims_size() != 4) {
+      available_devices->erase(DeviceType::GPU);
+    }
+
+    if (op_type == "Split") {
+      if (op.output_shape(0).dims_size() != 4
+          || op.output_shape(0).dims()[3] % 4 != 0) {
+        available_devices->erase(DeviceType::GPU);
+      }
+    } else if (op_type == "Concat") {
+      if (op.output_shape(0).dims_size() != 4) {
+        available_devices->erase(DeviceType::GPU);
+      } else {
+        if (op.input_size() != 2) {
+          for (const std::string &input : op.input()) {
+            if (tensor_shape_info.find(input) != tensor_shape_info.end()) {
+              auto &input_shape = tensor_shape_info.at(input);
+              if (input_shape[3] % 4 != 0) {
+                available_devices->erase(DeviceType::GPU);
+                break;
+              }
+            }
+          }
+        }
+      }
+    } else if (op_type == "ChannelShuffle") {
+      int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+          op, "group", 1);
+      int channels = op.output_shape(0).dims(3);
+      int channels_per_group = channels / groups;
+      if (groups % 4 != 0 || channels_per_group % 4 != 0) {
+        available_devices->erase(DeviceType::GPU);
+      }
+    }
+  }
+}
+
 }  // namespace
 
 std::unique_ptr<Operation> SerialNet::CreateOperation(
     const OpRegistryBase *op_registry,
     OpConstructContext *construct_context,
     std::shared_ptr<OperatorDef> op_def,
+    const std::unordered_map<std::string,
+        std::vector<index_t>> tensor_shape_info,
     DataFormat data_format_flag,
     bool is_quantize_model) {
   // Create the Operation
   DeviceType target_device_type = target_device_->device_type();
   // Get available devices
-  auto available_devices = op_registry->AvailableDevices(op_def->type());
+  std::set<DeviceType> available_devices;
+  FindAvailableDevicesForOp(*op_registry,
+                            *op_def,
+                            tensor_shape_info,
+                            &available_devices);
   // Find the device type to run the op.
   // If the target_device_type in available devices, use target_device_type,
   // otherwise, fallback to CPU device.
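The filter above only prunes DeviceType::GPU from the candidate set; the existing selection code below still falls back to CPU when GPU is no longer available. A minimal standalone sketch of the core shape rule, assuming a simplified op description (FakeOp and GpuIsUsable are hypothetical names, not MACE APIs):

#include <cstdint>
#include <string>
#include <vector>

// Illustrative stand-in for an operator description.
struct FakeOp {
  std::string type;
  std::vector<std::vector<int64_t>> output_shapes;  // one shape per output
};

// Mirrors the rule above: keep GPU only when the first output is 4-D (NHWC),
// and for Split additionally require the channel dim to be a multiple of 4.
bool GpuIsUsable(const FakeOp &op) {
  if (op.output_shapes.empty()) return true;  // unknown shapes: leave GPU in
  const std::vector<int64_t> &shape = op.output_shapes[0];
  if (shape.size() != 4) return false;
  if (op.type == "Split" && shape[3] % 4 != 0) return false;
  return true;
}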
@@ -93,6 +161,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
     }
   }
   op_def->set_device_type(device_type);
+
   // transpose output shape if run on CPU (default format is NHWC)
   if (!is_quantize_model && device_type == DeviceType::CPU &&
       op_def->output_shape_size() == op_def->output_size()) {
@@ -139,7 +208,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   MemoryType target_mem_type;
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
-  //
+
   DataFormat data_format_flag = NHWC;
   if (target_device_->device_type() == DeviceType::CPU) {
     target_mem_type = MemoryType::CPU_BUFFER;
@@ -163,6 +232,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           target_mem_type, DataType::DT_FLOAT, input_shape, -1));
     }
   }
+
 #ifdef MACE_ENABLE_OPENCL
   else {  // GPU  NOLINT[readability/braces]
     target_mem_type = MemoryType::GPU_BUFFER;
@@ -176,6 +246,22 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   }
 #endif  // MACE_ENABLE_OPENCL
 
+  std::unordered_map<std::string, std::vector<index_t>> tensor_shape_info;
+  for (auto &op : net_def->op()) {
+    if (op.output_size() != op.output_shape_size()) {
+      continue;
+    }
+    for (int i = 0; i < op.output_size(); ++i) {
+      tensor_shape_info[op.output(i)] =
+          std::move(std::vector<index_t>(op.output_shape(i).dims().begin(),
+                                         op.output_shape(i).dims().end()));
+    }
+  }
+  for (auto &tensor : net_def->tensors()) {
+    tensor_shape_info[tensor.name()] =
+        std::move(std::vector<index_t>(tensor.dims().begin(),
+                                       tensor.dims().end()));
+  }
   OpConstructContext construct_context(ws_);
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
@@ -183,6 +269,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
     auto op = CreateOperation(op_registry,
                               &construct_context,
                               op_def,
+                              tensor_shape_info,
                               data_format_flag,
                               is_quantize_model);
 #ifdef MACE_ENABLE_OPENCL
@@ -211,7 +298,8 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
         auto &output_info = output_map.at(op_def->input(i));
         // check whether the tensor has been transformed
         if (transformed_set.count(t_input_name) == 0) {
-          VLOG(1) << "Add Transform operation to transform tensor '"
+          VLOG(1) << "Add Transform operation " << op_def->name()
+                  << " to transform tensor '"
                   << op_def->input(i) << "', from memory type "
                   << output_info.mem_type << " to "
                   << wanted_in_mem_type
@@ -234,6 +322,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
               op_registry,
               &construct_context,
               transform_op_def,
+              tensor_shape_info,
              data_format_flag);
           operators_.emplace_back(std::move(transform_op));
           transformed_set.insert(t_input_name);
@@ -321,6 +410,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           op_registry,
           &construct_context,
           transform_op_def,
+          tensor_shape_info,
           output_data_format);
       operators_.emplace_back(std::move(transform_op));
       // where to do graph reference count.
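The constructor now builds tensor_shape_info once, from operator output shapes and constant tensors, and threads it through every CreateOperation call, including the inserted transform ops. A minimal sketch of building and querying such a name-to-shape map, assuming a hypothetical FakeTensorDef record in place of the protobuf types:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Illustrative stand-in for a tensor definition.
struct FakeTensorDef {
  std::string name;
  std::vector<int64_t> dims;
};

int main() {
  std::vector<FakeTensorDef> defs = {
      {"conv1_out", {1, 224, 224, 32}},
      {"fc_weight", {1024, 32}},
  };
  // Build the lookup once, keyed by tensor name.
  std::unordered_map<std::string, std::vector<int64_t>> tensor_shape_info;
  for (const auto &def : defs) {
    tensor_shape_info[def.name] = def.dims;
  }
  // A consumer (e.g. the Concat alignment check above) looks shapes up by name.
  auto it = tensor_shape_info.find("conv1_out");
  if (it != tensor_shape_info.end() && it->second.size() == 4) {
    std::cout << "channels: " << it->second[3]
              << (it->second[3] % 4 == 0 ? " (4-aligned)" : " (not 4-aligned)")
              << std::endl;
  }
  return 0;
}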
diff --git a/mace/core/net.h b/mace/core/net.h
index 10577a572f5a0629ae515d9b330befbaa639016e..5362d9ee4b8630a894cbb5705d6503bce2ed85f2 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -59,6 +59,8 @@ class SerialNet : public NetBase {
       const OpRegistryBase *op_registry,
       OpConstructContext *construct_context,
       std::shared_ptr<OperatorDef> op_def,
+      const std::unordered_map<std::string,
+          std::vector<index_t>> tensor_shape_info,
       DataFormat input_format,
       bool is_quantize_model = false);
 
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 43950a9db00a76ba84b35cb519c4c0c30ded6263..40ab5839ea41d29b610f4872df63bfd47c0d644f 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -131,7 +131,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
     }
 
     std::unique_ptr<Tensor> tensor(
-        new Tensor(device->allocator(), dst_data_type, true));
+        new Tensor(device->allocator(), dst_data_type, true,
+                   const_tensor.name()));
     tensor->Resize(dims);
 
     MACE_CHECK(tensor->size() == const_tensor.data_size(),
@@ -328,26 +329,52 @@ void Workspace::RemoveUnusedBuffer() {
 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
                                       const unsigned char *model_data,
                                       Allocator *alloc) {
+  std::unordered_set<std::string> tensor_to_host;
+  for (auto &op : net_def.op()) {
+    if (op.device_type() == DeviceType::CPU) {
+      for (std::string input : op.input()) {
+        tensor_to_host.insert(input);
+      }
+    }
+  }
   for (auto &const_tensor : net_def.tensors()) {
     auto iter = tensor_map_.find(const_tensor.name());
     if (iter->second->unused()) {
       tensor_map_.erase(iter);
-    } else if (!diffused_buffer_) {
-      tensor_map_.erase(iter);
+    } else {
       std::vector<index_t> dims;
       for (const index_t d : const_tensor.dims()) {
         dims.push_back(d);
       }
-      std::unique_ptr<Tensor> tensor(
-          new Tensor(alloc, const_tensor.data_type()));
-      tensor->Resize(dims);
-      MACE_CHECK(tensor->size() == const_tensor.data_size(),
-                 "Tensor's data_size not equal with the shape");
-      tensor->CopyBytes(model_data + const_tensor.offset(),
-                        const_tensor.data_size() *
-                        GetEnumTypeSize(const_tensor.data_type()));
-
-      tensor_map_[const_tensor.name()] = std::move(tensor);
+
+      if (tensor_to_host.find(const_tensor.name()) != tensor_to_host.end()
+          && const_tensor.data_type() == DataType::DT_HALF) {
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(alloc, DataType::DT_FLOAT,
+                       true, const_tensor.name()));
+        tensor->Resize(dims);
+        MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                   "Tensor's data_size not equal with the shape");
+        Tensor::MappingGuard guard(tensor.get());
+        float *dst_data = tensor->mutable_data<float>();
+        const half *org_data = reinterpret_cast<const half *>(
+            model_data + const_tensor.offset());
+        for (index_t i = 0; i < const_tensor.data_size(); ++i) {
+          dst_data[i] = half_float::half_cast<float>(org_data[i]);
+        }
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      } else if (!diffused_buffer_) {
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(alloc, const_tensor.data_type(),
+                       true, const_tensor.name()));
+        tensor->Resize(dims);
+        MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                   "Tensor's data_size not equal with the shape");
+        tensor->CopyBytes(model_data + const_tensor.offset(),
+                          const_tensor.data_size() *
+                          GetEnumTypeSize(const_tensor.data_type()));
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      }
     }
   }
   tensor_buffer_.reset(nullptr);
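RemoveAndReloadBuffer now collects the inputs of CPU ops into tensor_to_host and rebuilds any DT_HALF constant in that set as a DT_FLOAT tensor, converting element by element with half_float::half_cast<float>. A minimal sketch of that widening step in isolation, assuming the single-header half.hpp library already used by the diff is on the include path (WidenHalfToFloat is a hypothetical helper):

#include <cstddef>
#include <vector>

#include "half.hpp"  // provides half_float::half and half_float::half_cast

// Copies 'count' fp16 values into a new fp32 buffer, element by element,
// the same conversion the patch performs into the mapped DT_FLOAT tensor.
std::vector<float> WidenHalfToFloat(const half_float::half *src, size_t count) {
  std::vector<float> dst(count);
  for (size_t i = 0; i < count; ++i) {
    dst[i] = half_float::half_cast<float>(src[i]);
  }
  return dst;
}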
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 42959ca877e42002fb567138fb528ca11c086e6f..047cdf8e9e8d68d7254eb917bbfda3513db6015c 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -482,14 +482,14 @@ MaceStatus MaceEngine::Impl::Init(
   MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
                                                     &mem_optimizer,
                                                     device_.get()));
-
+  if (device_type_ == DeviceType::GPU) {
+    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
+  }
   MACE_RETURN_IF_ERROR(net_->Init());
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
-  if (device_type_ == DeviceType::GPU) {
-    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
-  }
+
   return MaceStatus::MACE_SUCCESS;
 }
 
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index d4404c618d0a06c75892782fab7bcd48866e5ebc..8301ccb54681bcf5fc1e521ec603ede8fc2d205f 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -58,14 +58,12 @@ class ChannelShuffleOp : public Operation {
 #pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t b = 0; b < batch; ++b) {
       for (index_t c = 0; c < channels; ++c) {
-        const T *input_base = input_ptr + b * batch_size;
-        T *output_base = output_ptr + b * batch_size;
         index_t g = c % groups_;
         index_t idx = c / groups_;
-        for (index_t hw = 0; hw < height * width; ++hw) {
-          output_base[c * image_size + hw] = input_base[
-              (g * channels_per_group + idx) * image_size + hw];
-        }
+        const T *in_ptr = input_ptr + b * batch_size
+            + (g * channels_per_group + idx) * image_size;
+        T *out_ptr = output_ptr + b * batch_size + c * image_size;
+        memcpy(out_ptr, in_ptr, image_size * sizeof(T));
       }
     }
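Because the CPU operator works on NCHW data, output channel c is a contiguous block of image_size elements taken from input channel (c % groups) * channels_per_group + (c / groups), which is what lets the inner element loop collapse into one memcpy per channel. A standalone sketch of that mapping (ShuffleChannelsNCHW is a hypothetical helper, not MACE's operator):

#include <cstring>

// Copies each output channel of an NCHW tensor from its shuffled source
// channel in one memcpy, matching the index math in the patched loop.
void ShuffleChannelsNCHW(const float *input, float *output,
                         int batch, int channels, int image_size, int groups) {
  const int channels_per_group = channels / groups;
  const int batch_size = channels * image_size;
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      const int g = c % groups;    // source group
      const int idx = c / groups;  // position within the source group
      const float *in = input + b * batch_size
          + (g * channels_per_group + idx) * image_size;
      float *out = output + b * batch_size + c * image_size;
      std::memcpy(out, in, image_size * sizeof(float));
    }
  }
}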