From 02c1048ea3c5007def340cb8415522fbc566c247 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?=
Date: Sat, 29 Dec 2018 11:41:33 +0800
Subject: [PATCH] Revert "Merge branch 'shuffle' into 'master'"

This reverts merge request !931
---
 mace/core/net.cc            | 96 ++-----------------------------------
 mace/core/net.h             |  2 -
 mace/core/workspace.cc      | 44 ++++-------------
 mace/libmace/mace.cc        |  8 ++--
 mace/ops/channel_shuffle.cc | 10 ++--
 5 files changed, 23 insertions(+), 137 deletions(-)

diff --git a/mace/core/net.cc b/mace/core/net.cc
index 2aeb951e..7912a6d4 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -14,7 +14,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -64,85 +63,18 @@ bool TransformRequiredOp(const std::string &op_type) {
 }
 #endif  // MACE_ENABLE_OPENCL
 
-
-// TODO(lichao): Move to runtime driver class after universality done.
-// fallback to gpu buffer when kernels are implemented
-void FindAvailableDevicesForOp(const OpRegistryBase &op_registry,
-                               const OperatorDef &op,
-                               const std::unordered_map<std::string,
-                                   std::vector<index_t>> &tensor_shape_info,
-                               std::set<DeviceType> *available_devices) {
-  auto devices = op_registry.AvailableDevices(op.type());
-  available_devices->insert(devices.begin(), devices.end());
-  std::string op_type = op.type();
-  // For those whose shape is not 4-rank but can run on GPU
-  if (op_type == "BufferTransform"
-      || op_type == "LSTMCell"
-      || op_type == "FullyConnected"
-      || op_type == "Softmax"
-      || op_type == "Squeeze") {
-    return;
-  } else {
-    if (op.output_shape_size() != op.output_size()) {
-      return;
-    }
-    if (op.output_shape(0).dims_size() != 4) {
-      available_devices->erase(DeviceType::GPU);
-    }
-
-    if (op_type == "Split") {
-      if (op.output_shape(0).dims_size() != 4
-          || op.output_shape(0).dims()[3] % 4 != 0) {
-        available_devices->erase(DeviceType::GPU);
-      }
-    } else if (op_type == "Concat") {
-      if (op.output_shape(0).dims_size() != 4) {
-        available_devices->erase(DeviceType::GPU);
-      } else {
-        if (op.input_size() != 2) {
-          for (const std::string &input : op.input()) {
-            if (tensor_shape_info.find(input) != tensor_shape_info.end()) {
-              auto &input_shape = tensor_shape_info.at(input);
-              if (input_shape[3] % 4 != 0) {
-                available_devices->erase(DeviceType::GPU);
-                break;
-              }
-            }
-          }
-        }
-      }
-    } else if (op_type == "ChannelShuffle") {
-      int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-          op, "group", 1);
-      int channels = op.output_shape(0).dims(3);
-      int channels_per_group = channels / groups;
-      if (groups % 4 != 0 || channels_per_group % 4 != 0) {
-        available_devices->erase(DeviceType::GPU);
-      }
-    }
-  }
-}
-
 }  // namespace
 
 std::unique_ptr<Operation> SerialNet::CreateOperation(
     const OpRegistryBase *op_registry,
     OpConstructContext *construct_context,
     std::shared_ptr<OperatorDef> op_def,
-    const std::unordered_map<std::string,
-        std::vector<index_t>> tensor_shape_info,
     DataFormat data_format_flag,
     bool is_quantize_model) {
   // Create the Operation
   DeviceType target_device_type = target_device_->device_type();
   // Get available devices
-  std::set<DeviceType> available_devices;
-  FindAvailableDevicesForOp(*op_registry,
-                            *op_def,
-                            tensor_shape_info,
-                            &available_devices);
+  auto available_devices = op_registry->AvailableDevices(op_def->type());
   // Find the device type to run the op.
   // If the target_device_type in available devices, use target_device_type,
   // otherwise, fallback to CPU device.
@@ -161,7 +93,6 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
     }
   }
   op_def->set_device_type(device_type);
-
   // transpose output shape if run on CPU (default format is NHWC)
   if (!is_quantize_model && device_type == DeviceType::CPU &&
       op_def->output_shape_size() == op_def->output_size()) {
@@ -208,7 +139,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   MemoryType target_mem_type;
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
-
+  //
   DataFormat data_format_flag = NHWC;
   if (target_device_->device_type() == DeviceType::CPU) {
     target_mem_type = MemoryType::CPU_BUFFER;
@@ -232,7 +163,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
             target_mem_type, DataType::DT_FLOAT, input_shape, -1));
       }
     }
-
 #ifdef MACE_ENABLE_OPENCL
   else {  // GPU NOLINT[readability/braces]
     target_mem_type = MemoryType::GPU_BUFFER;
@@ -246,22 +176,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   }
 #endif  // MACE_ENABLE_OPENCL
 
-  std::unordered_map<std::string, std::vector<index_t>> tensor_shape_info;
-  for (auto &op : net_def->op()) {
-    if (op.output_size() != op.output_shape_size()) {
-      continue;
-    }
-    for (int i = 0; i < op.output_size(); ++i) {
-      tensor_shape_info[op.output(i)] =
-          std::move(std::vector<index_t>(op.output_shape(i).dims().begin(),
-                                         op.output_shape(i).dims().end()));
-    }
-  }
-  for (auto &tensor : net_def->tensors()) {
-    tensor_shape_info[tensor.name()] =
-        std::move(std::vector<index_t>(tensor.dims().begin(),
-                                       tensor.dims().end()));
-  }
   OpConstructContext construct_context(ws_);
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
@@ -269,7 +183,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
     auto op = CreateOperation(op_registry,
                               &construct_context,
                               op_def,
-                              tensor_shape_info,
                               data_format_flag,
                               is_quantize_model);
 #ifdef MACE_ENABLE_OPENCL
@@ -298,8 +211,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
         auto &output_info = output_map.at(op_def->input(i));
         // check whether the tensor has been transformed
         if (transformed_set.count(t_input_name) == 0) {
-          VLOG(1) << "Add Transform operation " << op_def->name()
-                  << " to transform tensor "
+          VLOG(1) << "Add Transform operation to transform tensor '"
                   << op_def->input(i) << "', from memory type "
                   << output_info.mem_type << " to "
                   << wanted_in_mem_type
@@ -322,7 +234,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
               op_registry,
               &construct_context,
               transform_op_def,
-              tensor_shape_info,
              data_format_flag);
           operators_.emplace_back(std::move(transform_op));
           transformed_set.insert(t_input_name);
@@ -410,7 +321,6 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           op_registry,
           &construct_context,
           transform_op_def,
-          tensor_shape_info,
           output_data_format);
       operators_.emplace_back(std::move(transform_op));
       // where to do graph reference count.
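For context: the net.cc hunks above replace the shape-aware FindAvailableDevicesForOp() heuristic with a plain registry lookup, so op placement reverts to "run on the target device if a kernel is registered for the op, otherwise fall back to CPU". Below is a minimal standalone sketch of that restored policy; the registry function and enum here are simplified, hypothetical stand-ins for MACE's OpRegistryBase and DeviceType, not the real API.

```cpp
#include <iostream>
#include <set>
#include <string>

enum class DeviceType { CPU, GPU };

// Hypothetical registry: every op has a CPU kernel; only some have GPU ones.
std::set<DeviceType> AvailableDevices(const std::string &op_type) {
  if (op_type == "Conv2D" || op_type == "Softmax") {
    return {DeviceType::CPU, DeviceType::GPU};
  }
  return {DeviceType::CPU};
}

// The restored policy: prefer the target device, else fall back to CPU.
DeviceType SelectDevice(const std::string &op_type, DeviceType target) {
  const std::set<DeviceType> available = AvailableDevices(op_type);
  return available.count(target) != 0 ? target : DeviceType::CPU;
}

int main() {
  // "Conv2D" stays on GPU; "TopK" (no GPU kernel here) falls back to CPU.
  std::cout << (SelectDevice("Conv2D", DeviceType::GPU) == DeviceType::GPU)
            << "\n"
            << (SelectDevice("TopK", DeviceType::GPU) == DeviceType::CPU)
            << "\n";
  return 0;
}
```

The pre-revert code refined this per-op with output-shape checks (4-rank, channel alignment); the revert trades that granularity for the simpler registry-only lookup.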
diff --git a/mace/core/net.h b/mace/core/net.h
index 5362d9ee..10577a57 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -59,8 +59,6 @@ class SerialNet : public NetBase {
       const OpRegistryBase *op_registry,
       OpConstructContext *construct_context,
       std::shared_ptr<OperatorDef> op_def,
-      const std::unordered_map<std::string,
-                               std::vector<index_t>> tensor_shape_info,
       DataFormat input_format,
       bool is_quantize_model = false);
 
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 0d2e58dd..43950a9d 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -328,14 +328,6 @@ void Workspace::RemoveUnusedBuffer() {
 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
                                       const unsigned char *model_data,
                                       Allocator *alloc) {
-  std::unordered_set<std::string> tensor_to_host;
-  for (auto &op : net_def.op()) {
-    if (op.device_type() == DeviceType::CPU) {
-      for (std::string input : op.input()) {
-        tensor_to_host.insert(input);
-      }
-    }
-  }
   for (auto &const_tensor : net_def.tensors()) {
     auto iter = tensor_map_.find(const_tensor.name());
     if (iter->second->unused()) {
@@ -346,32 +338,16 @@ void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
       for (const index_t d : const_tensor.dims()) {
        dims.push_back(d);
       }
-
-      if (tensor_to_host.find(const_tensor.name()) != tensor_to_host.end()) {
-        DataType host_data_type = const_tensor.data_type();
-        if (host_data_type == DataType::DT_HALF) {
-          host_data_type = DataType::DT_FLOAT;
-        }
-        std::unique_ptr<Tensor> tensor(
-            new Tensor(alloc, host_data_type, true, const_tensor.name()));
-        tensor->Resize(dims);
-        MACE_CHECK(tensor->size() == const_tensor.data_size(),
-                   "Tensor's data_size not equal with the shape");
-        if (const_tensor.data_type() == DataType::DT_HALF) {
-          Tensor::MappingGuard guard(tensor.get());
-          float *dst_data = tensor->mutable_data<float>();
-          const half *org_data = reinterpret_cast<const half *>(
-              model_data + const_tensor.offset());
-          for (index_t i = 0; i < const_tensor.data_size(); ++i) {
-            dst_data[i] = half_float::half_cast<float>(org_data[i]);
-          }
-        } else {
-          tensor->CopyBytes(model_data + const_tensor.offset(),
-                            const_tensor.data_size() *
-                                GetEnumTypeSize(const_tensor.data_type()));
-        }
-        tensor_map_[const_tensor.name()] = std::move(tensor);
-      }
+      std::unique_ptr<Tensor> tensor(
+          new Tensor(alloc, const_tensor.data_type()));
+      tensor->Resize(dims);
+      MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                 "Tensor's data_size not equal with the shape");
+      tensor->CopyBytes(model_data + const_tensor.offset(),
+                        const_tensor.data_size() *
+                            GetEnumTypeSize(const_tensor.data_type()));
+
+      tensor_map_[const_tensor.name()] = std::move(tensor);
     }
   }
   tensor_buffer_.reset(nullptr);
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 047cdf8e..42959ca8 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -482,14 +482,14 @@ MaceStatus MaceEngine::Impl::Init(
   MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
                                                     &mem_optimizer,
                                                     device_.get()));
-  if (device_type_ == DeviceType::GPU) {
-    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
-  }
+
   MACE_RETURN_IF_ERROR(net_->Init());
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
-
+  if (device_type_ == DeviceType::GPU) {
+    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
+  }
   return MaceStatus::MACE_SUCCESS;
 }
 
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index 8301ccb5..d4404c61 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -58,12 +58,14 @@ class ChannelShuffleOp : public Operation {
 #pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t b = 0; b < batch; ++b) {
       for (index_t c = 0; c < channels; ++c) {
+        const T *input_base = input_ptr + b * batch_size;
+        T *output_base = output_ptr + b * batch_size;
         index_t g = c % groups_;
         index_t idx = c / groups_;
-        const T *in_ptr = input_ptr + b * batch_size
-            + (g * channels_per_group + idx) * image_size;
-        T *out_ptr = output_ptr + b * batch_size + c * image_size;
-        memcpy(out_ptr, in_ptr, image_size * sizeof(float));
+        for (index_t hw = 0; hw < height * width; ++hw) {
+          output_base[c * image_size + hw] = input_base[
+              (g * channels_per_group + idx) * image_size + hw];
+        }
       }
     }
 
-- 
GitLab
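For context: the channel_shuffle.cc hunk reverts a memcpy-based fast path back to a per-element gather, where output channel c reads input channel (c % groups) * channels_per_group + c / groups. Below is a minimal single-threaded sketch of that mapping on plain float/NCHW data, without the templating and OpenMP pragmas of the real op; function and parameter names are illustrative, not MACE's.

```cpp
#include <cassert>
#include <vector>

// Gather-style channel shuffle (NCHW): output channel c copies input channel
// (c % groups) * channels_per_group + c / groups, one element at a time.
void ChannelShuffle(const float *input, float *output,
                    int batch, int channels, int image_size, int groups) {
  assert(channels % groups == 0);
  const int channels_per_group = channels / groups;
  const int batch_size = channels * image_size;
  for (int b = 0; b < batch; ++b) {
    const float *input_base = input + b * batch_size;
    float *output_base = output + b * batch_size;
    for (int c = 0; c < channels; ++c) {
      const int src_c = (c % groups) * channels_per_group + c / groups;
      for (int hw = 0; hw < image_size; ++hw) {
        output_base[c * image_size + hw] =
            input_base[src_c * image_size + hw];
      }
    }
  }
}

int main() {
  // 1 batch, 4 channels, 1 pixel, 2 groups: [c0 c1 c2 c3] -> [c0 c2 c1 c3].
  std::vector<float> in = {0, 1, 2, 3}, out(4);
  ChannelShuffle(in.data(), out.data(), 1, 4, 1, 2);
  assert(out[0] == 0 && out[1] == 2 && out[2] == 1 && out[3] == 3);
  return 0;
}
```

Note the pointer setup sits inside the inner loop in the patch itself: with `collapse(2)`, OpenMP requires nothing between the two collapsed loops, so the b-dependent base pointers cannot be hoisted there.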
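The workspace.cc hunk likewise drops the path that promoted DT_HALF constant tensors to float for CPU consumers; after this revert, constants are reloaded with their stored data type unconditionally. The conversion being removed is a per-element widening cast, sketched below under the assumption that the single-header half library MACE vendors (half.hpp, providing half_float::half and half_cast) is on the include path.

```cpp
#include <half.hpp>  // half_float::half, half_float::half_cast

#include <cstddef>
#include <vector>

// Widen a buffer of fp16 weights to fp32 one element at a time, as the
// removed workspace.cc loop did for constants feeding CPU ops.
std::vector<float> HalfToFloat(const half_float::half *src, std::size_t n) {
  std::vector<float> dst(n);
  for (std::size_t i = 0; i < n; ++i) {
    dst[i] = half_float::half_cast<float>(src[i]);
  }
  return dst;
}

int main() {
  // 0.5 and -1.25 are exactly representable in fp16, so the round trip
  // through half and back to float is lossless here.
  std::vector<half_float::half> weights = {
      half_float::half(0.5f), half_float::half(-1.25f)};
  std::vector<float> promoted = HalfToFloat(weights.data(), weights.size());
  return (promoted[0] == 0.5f && promoted[1] == -1.25f) ? 0 : 1;
}
```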