Commit 696cf9d9 authored by liuqi

Bug: Support more reasonable auto transformation between CPU and GPU
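
Summary (as read from the diff below): SerialNet now tracks the required memory type and data type of each operator input individually through the new OpConstructContext::SetInputInfo / GetInputMemType / GetInputDataType API, instead of assuming every input matches the op's output memory type. A kNoTransformOp set skips transform insertion for ops whose outputs are host-side metadata (Shape, InferConv2dShape), and the GPU registrations of Shape, Stack, StridedSlice and ScalarMath are removed so these ops always run on the CPU.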

Parent bfbe1a30
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <utility>
#include <algorithm>
#include <limits>
#include <unordered_set>
#include <utility>
#include "mace/core/future.h"
#include "mace/core/macros.h"
......@@ -53,6 +54,13 @@ std::string TransformedName(const std::string &input_name,
ss << input_name << "_mem_type_" << mem_type;
return ss.str();
}
bool TransformRequiredOp(const std::string &op_type) {
static const std::unordered_set<std::string> kNoTransformOp = {
"Shape", "InferConv2dShape"
};
return kNoTransformOp.count(op_type) == 0;
}
#endif // MACE_ENABLE_OPENCL
} // namespace
......@@ -72,6 +80,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
// otherwise, fall back to the CPU device.
DeviceType device_type = DeviceType::CPU;
construct_context->set_device(cpu_device_);
construct_context->set_operator_def(op_def);
construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
for (auto device : available_devices) {
if (device == target_device_type) {
......@@ -103,7 +112,6 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
}
}
}
construct_context->set_operator_def(op_def);
std::unique_ptr<Operation> op(
op_registry->CreateOperation(construct_context, device_type));
return std::move(op);
......@@ -126,7 +134,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
std::unordered_map<std::string, InternalOutputInfo> output_map;
// used for memory optimization
std::unordered_map<std::string, MemoryType> output_mem_map;
std::unordered_map<std::string, std::string> transformed_map;
std::unordered_set<std::string> transformed_set;
// add input information
MemoryType target_mem_type;
// quantize model flag
......@@ -180,71 +188,80 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
#ifdef MACE_ENABLE_OPENCL
// Add input transform operation if necessary
if (target_device_->device_type() == DeviceType::GPU) {
const DataType dt =
static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
// the memory type of the operation's outputs
MemoryType out_mem_type = construct_context.output_mem_type();
int input_size = op_def->input_size();
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is a memory-reuse op, no transformation is needed
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether is the output tensor of other operation
if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
output_map.at(op_def->input(i)).dtype != dt) {
auto key = TransformedName(op_def->input(i), out_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_map.count(key) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to " << out_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< dt;
std::string input_name = op_def->input(i);
std::string t_input_name =
TransformedName(input_name,
out_mem_type);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
// skip transform insertion for ops that do not need it (e.g. Shape)
if (TransformRequiredOp(op_def->type())) {
for (int i = 0; i < input_size; ++i) {
if (output_map.count(op_def->input(i)) == 1) {
// if op is a memory-reuse op, no transformation is needed
if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
out_mem_type = output_map.at(op_def->input(i)).mem_type;
break;
}
// check whether to do transform
MemoryType wanted_in_mem_type =
construct_context.GetInputMemType(i);
DataType wanted_in_dt = construct_context.GetInputDataType(i);
if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
|| output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
auto t_input_name = TransformedName(op_def->input(i),
wanted_in_mem_type);
auto &output_info = output_map.at(op_def->input(i));
// check whether the tensor has been transformed
if (transformed_set.count(t_input_name) == 0) {
VLOG(1) << "Add Transform operation to transform tensor '"
<< op_def->input(i) << "', from memory type "
<< output_info.mem_type << " to "
<< wanted_in_mem_type
<< ", from Data Type " << output_info.dtype << " to "
<< wanted_in_dt;
std::string input_name = op_def->input(i);
op_def->set_input(i, t_input_name);
auto input_shape = output_info.shape;
if (output_info.mem_type == MemoryType::CPU_BUFFER &&
input_shape.size() == 4) {
// NCHW -> NHWC
input_shape =
TransposeShape<index_t, index_t>(input_shape,
{0, 2, 3, 1});
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
wanted_in_dt, wanted_in_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_set.insert(t_input_name);
output_mem_map[t_input_name] = wanted_in_mem_type;
// update the graph reference count for the new transform op.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, t_input_name);
}
auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
input_name, input_shape, t_input_name,
dt, out_mem_type);
auto transform_op = CreateOperation(
op_registry,
&construct_context,
transform_op_def,
data_format_flag);
operators_.emplace_back(std::move(transform_op));
transformed_map.emplace(key, t_input_name);
output_mem_map[t_input_name] = out_mem_type;
// update the graph reference count for the new transform op.
mem_optimizer->UpdateTensorRef(transform_op_def.get());
} else {
op_def->set_input(i, transformed_map[key]);
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
} else {
MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
&& ws_->GetTensor(op_def->input(i))->is_weight(),
"Tensor ", op_def->input(i), " of ",
op_def->name(), " not allocated");
}
}
// update the map: output_tensor -> Operation
for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
DataType dt;
if (op_def->output_type_size() == op_def->output_size()) {
dt = op_def->output_type(out_idx);
} else {
dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
}
output_mem_map[op_def->output(out_idx)] = out_mem_type;
output_map.emplace(
op_def->output(out_idx),
......@@ -272,13 +289,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
auto &internal_output_info = output_map.at(output_info.name());
if ((internal_output_info.mem_type != target_mem_type &&
internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
internal_output_info.dtype != DataType::DT_FLOAT) {
internal_output_info.dtype != output_info.data_type()) {
VLOG(1) << "Add Transform operation to transform output tensor '"
<< output_info.name() << "', from memory type "
<< internal_output_info.mem_type
<< " to " << target_mem_type
<< ", from Data Type " << internal_output_info.dtype
<< " to " << DataType::DT_FLOAT;
<< " to " << output_info.data_type();
std::string t_output_name = TransformedName(output_info.name(),
target_mem_type);
auto output_op_def =
......@@ -298,7 +315,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
t_output_name,
internal_output_info.shape,
output_info.name(),
DataType::DT_FLOAT,
output_info.data_type(),
target_mem_type);
auto transform_op = CreateOperation(
op_registry,
......
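Condensed, the per-input transform decision added above amounts to this sketch (a simplification of the diff, with error handling omitted; all names are as in the code above):

  // For each input i: compare what the producer wrote (output_map)
  // with what this op declared it wants (construct_context).
  MemoryType wanted_mt = construct_context.GetInputMemType(i);
  DataType wanted_dt = construct_context.GetInputDataType(i);
  const auto &produced = output_map.at(op_def->input(i));
  if (produced.mem_type != wanted_mt || produced.dtype != wanted_dt) {
    std::string t_name = TransformedName(op_def->input(i), wanted_mt);
    if (transformed_set.count(t_name) == 0) {
      // first consumer with this need: insert a transform op and
      // remember its output so later consumers can reuse it
      transformed_set.insert(t_name);
    }
    op_def->set_input(i, t_name);  // rewire the input in both cases
  }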
......@@ -24,6 +24,57 @@ namespace mace {
OpConstructContext::OpConstructContext(Workspace *ws)
: operator_def_(nullptr), ws_(ws), device_(nullptr) {}
void OpConstructContext::set_operator_def(
std::shared_ptr<mace::OperatorDef> operator_def) {
operator_def_ = operator_def;
input_data_types_.clear();
}
void OpConstructContext::set_output_mem_type(mace::MemoryType type) {
MACE_CHECK(operator_def_ != nullptr);
output_mem_type_ = type;
input_mem_types_.clear();
}
void OpConstructContext::SetInputInfo(size_t idx,
mace::MemoryType mem_type,
mace::DataType dt) {
if (input_mem_types_.empty()) {
// by default, inputs' memory types are the same as the output memory type.
input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
}
if (input_data_types_.empty()) {
// by default, inputs' data types are the same as the operation's data type.
DataType op_dt = static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
input_data_types_.resize(operator_def_->input_size(), op_dt);
}
MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size());
input_mem_types_[idx] = mem_type;
input_data_types_[idx] = dt;
}
MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
if (input_mem_types_.empty()) {
return output_mem_type_;
}
MACE_CHECK(idx < input_mem_types_.size(),
idx, " < ", input_mem_types_.size());
return input_mem_types_[idx];
}
DataType OpConstructContext::GetInputDataType(size_t idx) const {
if (input_data_types_.empty()) {
// by default, inputs' data types are the same as the operation's data type.
return static_cast<DataType>(
ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
*operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
}
MACE_CHECK(idx < input_data_types_.size());
return input_data_types_[idx];
}
OpInitContext::OpInitContext(Workspace *ws, Device *device)
: ws_(ws), device_(device) {}
......
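A minimal usage sketch of the new per-input API (the SetInputInfo call mirrors the Deconv2d change later in this commit; the fallbacks are as implemented above):

  // In an op's constructor: pin input 2 to a host int32 buffer,
  // e.g. a shape tensor that must stay on the CPU.
  context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);

  // While building the graph, SerialNet then asks per input:
  MemoryType mt = construct_context.GetInputMemType(2);  // CPU_BUFFER
  DataType dt = construct_context.GetInputDataType(2);   // DT_INT32
  // Inputs never touched by SetInputInfo fall back to
  // output_mem_type() and the operation's "T" argument.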
......@@ -35,9 +35,7 @@ class OpConstructContext {
explicit OpConstructContext(Workspace *ws);
~OpConstructContext() = default;
inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
operator_def_ = operator_def;
}
void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
inline std::shared_ptr<OperatorDef> operator_def() const {
return operator_def_;
......@@ -55,19 +53,26 @@ class OpConstructContext {
return device_;
}
inline void set_output_mem_type(MemoryType type) {
output_mem_type_ = type;
}
void set_output_mem_type(MemoryType type);
inline MemoryType output_mem_type() const {
return output_mem_type_;
}
void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt);
MemoryType GetInputMemType(size_t idx) const;
DataType GetInputDataType(size_t idx) const;
private:
std::shared_ptr<OperatorDef> operator_def_;
Workspace *ws_;
Device *device_;
MemoryType output_mem_type_; // used for transform memory
// used for memory transform
std::vector<MemoryType> input_mem_types_;
std::vector<DataType> input_data_types_;
MemoryType output_mem_type_; // there is only one output memory type now.
};
// memory_optimizer, device
......@@ -93,6 +98,12 @@ class OpInitContext {
Device *device_;
};
// Conventions
// * If a data format exists, NHWC is the default format.
// * The input/output format of CPU ops with float data type is NCHW.
// * The input/output format of GPU ops and CPU quantization ops is NHWC.
// * Inputs' data types are the same as the operation's data type by default.
// * Outputs' data types are the same as the operation's data type by default.
class Operation {
public:
explicit Operation(OpConstructContext *context);
......
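For example, the operation data type that the last two conventions refer to is resolved from the "T" argument and defaults to float (the same lookup OpConstructContext performs above):

  DataType op_dt = static_cast<DataType>(
      ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
          *operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));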
......@@ -612,11 +612,9 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<int> &dilations,
const int wino_blk_size = 0) {
testing::internal::LogToStderr();
srand(time(NULL));
auto func = [&](index_t batch, int stride_h, int stride_w, Padding padding) {
// generate random input
static unsigned int seed = time(NULL);
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t kernel_h = filter_shape[0];
......
......@@ -375,10 +375,16 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
context, operator_def_.get(), 2,
OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
}
} else if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
== MaceStatus::MACE_SUCCESS);
} else {
if (operator_def_->input_size() >= 4) {
MACE_CHECK(TransformFilter<T>(
context,
operator_def_.get(),
3,
OpenCLBufferType::ARGUMENT,
mem_type) == MaceStatus::MACE_SUCCESS);
}
context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
}
}
MaceStatus Run(OpContext *context) override {
......
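Note: in the non-Caffe branch above, input 2 is presumably the deconvolution's output-shape tensor; instead of transforming it to GPU memory, the op pins it to a CPU int32 buffer via SetInputInfo, and only the optional bias (input 3) still goes through TransformFilter.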
......@@ -166,10 +166,15 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
}
}
for (auto output : op_def_.output()) {
ws_.RemoveTensor(output);
for (int i = 0; i < op_def_.output_size(); ++i) {
ws_.RemoveTensor(op_def_.output(i));
auto output_info = net_def.add_output_info();
output_info->set_name(output);
output_info->set_name(op_def_.output(i));
if (op_def_.output_type_size() == op_def_.output_size()) {
output_info->set_data_type(op_def_.output_type(i));
} else {
output_info->set_data_type(DataType::DT_FLOAT);
}
}
}
MemoryOptimizer mem_optimizer;
......
......@@ -21,6 +21,7 @@
namespace mace {
namespace ops {
namespace {
template <typename T, typename DstType>
void ScalarEltwise(const T* in0,
const T* in1,
......@@ -81,6 +82,7 @@ void ScalarEltwise(const T* in0,
LOG(FATAL) << "Eltwise op not support type " << type;
}
}
} // namespace
template <DeviceType D, typename T>
......@@ -156,12 +158,6 @@ void RegisterScalarMath(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
DeviceType::GPU, int32_t);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -79,30 +79,6 @@ TEST_F(ScalarMathOpTest, SimpleCPU) {
ops::EltwiseType::EQUAL, 3, 3, 1, 1);
}
TEST_F(ScalarMathOpTest, SimpleGPU) {
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::SUM, 1, 2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::SUB, 1, 2, 1, -1);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::PROD, 3, -2, 1, -6);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::DIV, 3, -2, 1, -1.5);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::MIN, 3, -2, 1, -2);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::MAX, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::NEG, 3, -2, 1, -3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::ABS, 3, -2, 1, 3);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
ScalarMathTest<DeviceType::GPU, float, float>(
ops::EltwiseType::POW, 3, 1, 1, 3);
ScalarMathTest<DeviceType::GPU, float, int32_t>(
ops::EltwiseType::EQUAL, 3, 3, 1, 1);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -21,11 +21,7 @@ template <DeviceType D, typename T>
class ShapeOp : public Operation {
public:
explicit ShapeOp(OpConstructContext *context)
: Operation(context) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
: Operation(context) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -66,12 +62,6 @@ class ShapeOp : public Operation {
void RegisterShape(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::CPU, float);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
DeviceType::GPU, half);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
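With per-input transforms in place, ShapeOp no longer needs to force a GPU_BUFFER output or keep a GPU registration: it is listed in kNoTransformOp at the top of this commit, always runs on the CPU, and its output is consumed as host-side metadata.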
......@@ -25,11 +25,7 @@ class StackOp : public Operation {
public:
explicit StackOp(OpConstructContext *context)
: Operation(context),
axis_(Operation::GetOptionalArg<int>("axis", 0)) {
if (D == DeviceType::GPU) {
context->set_output_mem_type(MemoryType::GPU_BUFFER);
}
}
axis_(Operation::GetOptionalArg<int>("axis", 0)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -54,6 +50,7 @@ class StackOp : public Operation {
}
// Output is on host, no need to map data
Tensor::MappingGuard output_guard(output);
auto *output_data = output->mutable_data<T>();
std::vector<const T *> input_data(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
......@@ -83,10 +80,6 @@ class StackOp : public Operation {
void RegisterStack(OpRegistryBase *op_registry) {
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, int32_t);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -217,12 +217,6 @@ void RegisterStridedSlice(OpRegistryBase *op_registry) {
DeviceType::CPU, float);
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::CPU, int32_t);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::GPU, float);
MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
DeviceType::GPU, int32_t);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -741,7 +741,7 @@ def download_file(url, dst, num_retries=3):
MaceLogger.info('\nDownloaded successfully.')
except (urllib.error.ContentTooShortError, urllib.error.HTTPError,
urllib.error.URLError) as e:
MaceLogger.warning('Download error:', e)
MaceLogger.warning('Download error:' + str(e))
if num_retries > 0:
return download_file(url, dst, num_retries - 1)
else:
......
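The logging fix above concatenates the exception into the message, presumably because MaceLogger.warning accepts a single string rather than a printf-style argument list.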