Commit 3bd4df8f authored by 刘琦

Merge branch 'gpu_fix' into 'master'

Temporary solution for CPU/GPU runtime fallback: ops whose shapes the GPU kernels cannot handle fall back to the CPU runtime. Fix several issues.

See merge request !939
......@@ -14,6 +14,7 @@
#include <algorithm>
#include <limits>
#include <set>
#include <unordered_set>
#include <utility>
......@@ -63,18 +64,85 @@ bool TransformRequiredOp(const std::string &op_type) {
}
#endif // MACE_ENABLE_OPENCL
// TODO(lichao): Move to the runtime driver class after universality is done.
// Fall back to GPU buffer when the kernels are implemented.
void FindAvailableDevicesForOp(const OpRegistryBase &op_registry,
const OperatorDef &op,
const std::unordered_map<std::string,
std::vector<index_t>> &tensor_shape_info,
std::set<DeviceType>
*available_devices) {
auto devices = op_registry.AvailableDevices(op.type());
available_devices->insert(devices.begin(), devices.end());
std::string op_type = op.type();
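// The checks below remove GPU from the candidate set when the shapes
// involved cannot be handled by the corresponding OpenCL kernels, so that
// such ops fall back to the CPU runtime.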
// Ops whose output is not 4-D but which can still run on GPU
if (op_type == "BufferTransform"
|| op_type == "LSTMCell"
|| op_type == "FullyConnected"
|| op_type == "Softmax"
|| op_type == "Squeeze") {
return;
} else {
if (op.output_shape_size() != op.output_size()) {
return;
}
if (op.output_shape(0).dims_size() != 4) {
available_devices->erase(DeviceType::GPU);
}
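// Split: the GPU kernel needs a 4-D output whose channel count is a
// multiple of 4.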
if (op_type == "Split") {
if (op.output_shape(0).dims_size() != 4
|| op.output_shape(0).dims(3) % 4 != 0) {
available_devices->erase(DeviceType::GPU);
}
} else if (op_type == "Concat") {
if (op.output_shape(0).dims_size() != 4) {
available_devices->erase(DeviceType::GPU);
} else {
if (op.input_size() != 2) {
for (const std::string &input : op.input()) {
if (tensor_shape_info.find(input) != tensor_shape_info.end()) {
auto &input_shape = tensor_shape_info.at(input);
if (input_shape[3] % 4 != 0) {
available_devices->erase(DeviceType::GPU);
break;
}
}
}
}
}
} else if (op_type == "ChannelShuffle") {
int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
op, "group", 1);
int channels = op.output_shape(0).dims(3);
int channels_per_group = channels / groups;
if (groups % 4 != 0 || channels_per_group % 4 != 0) {
available_devices->erase(DeviceType::GPU);
}
}
}
}
} // namespace
std::unique_ptr<Operation> SerialNet::CreateOperation(
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
const std::unordered_map<std::string,
std::vector<index_t>> tensor_shape_info,
DataFormat data_format_flag,
bool is_quantize_model) {
// Create the Operation
DeviceType target_device_type = target_device_->device_type();
// Get available devices
auto available_devices = op_registry->AvailableDevices(op_def->type());
std::set<DeviceType> available_devices;
FindAvailableDevicesForOp(*op_registry,
*op_def,
tensor_shape_info,
&available_devices);
// Find the device type to run the op.
// If target_device_type is among the available devices, use it;
// otherwise, fall back to the CPU device.
......@@ -93,6 +161,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
}
}
op_def->set_device_type(device_type);
// transpose output shape if run on CPU (default format is NHWC)
if (!is_quantize_model && device_type == DeviceType::CPU &&
op_def->output_shape_size() == op_def->output_size()) {
......@@ -139,7 +208,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
MemoryType target_mem_type;
// quantize model flag
bool is_quantize_model = IsQuantizedModel(*net_def);
//
DataFormat data_format_flag = NHWC;
if (target_device_->device_type() == DeviceType::CPU) {
target_mem_type = MemoryType::CPU_BUFFER;
......@@ -163,6 +232,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
target_mem_type, DataType::DT_FLOAT, input_shape, -1));
}
}
#ifdef MACE_ENABLE_OPENCL
else { // GPU NOLINT[readability/braces]
target_mem_type = MemoryType::GPU_BUFFER;
......@@ -176,6 +246,22 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
}
#endif // MACE_ENABLE_OPENCL
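// Record the output shape of every op and constant tensor so that
// FindAvailableDevicesForOp can validate input shapes (e.g. for Concat).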
std::unordered_map<std::string, std::vector<index_t>> tensor_shape_info;
for (auto &op : net_def->op()) {
if (op.output_size() != op.output_shape_size()) {
continue;
}
for (int i = 0; i < op.output_size(); ++i) {
tensor_shape_info[op.output(i)] =
std::vector<index_t>(op.output_shape(i).dims().begin(),
op.output_shape(i).dims().end());
}
}
for (auto &tensor : net_def->tensors()) {
tensor_shape_info[tensor.name()] =
std::vector<index_t>(tensor.dims().begin(),
tensor.dims().end());
}
OpConstructContext construct_context(ws_);
for (int idx = 0; idx < net_def->op_size(); ++idx) {
std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
......@@ -183,6 +269,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto op = CreateOperation(op_registry,
&construct_context,
op_def,
tensor_shape_info,
data_format_flag,
is_quantize_model);
#ifdef MACE_ENABLE_OPENCL
......@@ -211,7 +298,8 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
VLOG(1) << "Add Transform operation " << op_def->name()
<< " to transform tensor "
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
......@@ -234,6 +322,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
op_registry,
&construct_context,
transform_op_def,
tensor_shape_info,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
......@@ -321,6 +410,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
op_registry,
&construct_context,
transform_op_def,
tensor_shape_info,
output_data_format);
operators_.emplace_back(std::move(transform_op));
// where to do graph reference count.
......
......@@ -59,6 +59,8 @@ class SerialNet : public NetBase {
const OpRegistryBase *op_registry,
OpConstructContext *construct_context,
std::shared_ptr<OperatorDef> op_def,
const std::unordered_map<std::string,
std::vector<index_t>> tensor_shape_info,
DataFormat input_format,
bool is_quantize_model = false);
......
......@@ -131,7 +131,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
}
std::unique_ptr<Tensor> tensor(
new Tensor(device->allocator(), dst_data_type, true));
new Tensor(device->allocator(), dst_data_type, true,
const_tensor.name()));
tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(),
......@@ -328,26 +329,52 @@ void Workspace::RemoveUnusedBuffer() {
void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
const unsigned char *model_data,
Allocator *alloc) {
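// Collect the inputs of ops that run on CPU; their half-precision constant
// tensors must be reloaded on the host as float.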
std::unordered_set<std::string> tensor_to_host;
for (auto &op : net_def.op()) {
if (op.device_type() == DeviceType::CPU) {
for (std::string input : op.input()) {
tensor_to_host.insert(input);
}
}
}
for (auto &const_tensor : net_def.tensors()) {
auto iter = tensor_map_.find(const_tensor.name());
if (iter->second->unused()) {
tensor_map_.erase(iter);
} else if (!diffused_buffer_) {
tensor_map_.erase(iter);
} else {
std::vector<index_t> dims;
for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
std::unique_ptr<Tensor> tensor(
new Tensor(alloc, const_tensor.data_type()));
tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape");
tensor->CopyBytes(model_data + const_tensor.offset(),
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type()));
tensor_map_[const_tensor.name()] = std::move(tensor);
if (tensor_to_host.find(const_tensor.name()) != tensor_to_host.end()
&& const_tensor.data_type() == DataType::DT_HALF) {
std::unique_ptr<Tensor> tensor(
new Tensor(alloc, DataType::DT_FLOAT,
true, const_tensor.name()));
tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape");
Tensor::MappingGuard guard(tensor.get());
float *dst_data = tensor->mutable_data<float>();
const half *org_data = reinterpret_cast<const half *>(
model_data + const_tensor.offset());
for (index_t i = 0; i < const_tensor.data_size(); ++i) {
dst_data[i] = half_float::half_cast<float>(org_data[i]);
}
tensor_map_[const_tensor.name()] = std::move(tensor);
} else if (!diffused_buffer_) {
std::unique_ptr<Tensor> tensor(
new Tensor(alloc, const_tensor.data_type(),
true, const_tensor.name()));
tensor->Resize(dims);
MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape");
tensor->CopyBytes(model_data + const_tensor.offset(),
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type()));
tensor_map_[const_tensor.name()] = std::move(tensor);
}
}
}
tensor_buffer_.reset(nullptr);
......
......@@ -482,14 +482,14 @@ MaceStatus MaceEngine::Impl::Init(
MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
&mem_optimizer,
device_.get()));
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
}
MACE_RETURN_IF_ERROR(net_->Init());
#ifdef MACE_ENABLE_HEXAGON
}
#endif
if (device_type_ == DeviceType::GPU) {
ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
}
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -58,14 +58,12 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T *input_base = input_ptr + b * batch_size;
T *output_base = output_ptr + b * batch_size;
index_t g = c % groups_;
index_t idx = c / groups_;
for (index_t hw = 0; hw < height * width; ++hw) {
output_base[c * image_size + hw] = input_base[
(g * channels_per_group + idx) * image_size + hw];
}
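// Copy the whole height * width plane of each channel with a single memcpy
// instead of assigning element by element.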
const T *in_ptr = input_ptr + b * batch_size
+ (g * channels_per_group + idx) * image_size;
T *out_ptr = output_ptr + b * batch_size + c * image_size;
memcpy(out_ptr, in_ptr, image_size * sizeof(T));
}
}
......