Merge branch 'master' into 'master'

Finish add2 opencl kernel and update tf converter to support icnet. See merge request !100

Merge branch 'master' into 'master'
Finish add2 opencl kernel and update tf converter to support icnet. See merge request !100
d4b8a5e0 · 刘琦 · 69f4b8b1 · 1f4d99c5 · d4b8a5e0 · d4b8a5e0
9 changed file
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -33,8 +33,8 @@ class OpenCLRuntime {

 private:
  cl::Context context_;
-  cl::CommandQueue command_queue_;
  cl::Device device_;
+  cl::CommandQueue command_queue_;
  cl::Program program_;
  std::once_flag build_flag_;
 };

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -10,22 +10,31 @@
 namespace mace {
 namespace kernels {

-template <DeviceType D, typename T>
+template<DeviceType D, typename T>
 struct AddNFunctor {
-  void operator()(const vector<const T *> &inputs, T *output, index_t size) {
-    memset(output, 0, size * sizeof(T));
-    int n = inputs.size();
+  void operator()(std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
+    Tensor::MappingGuard output_map(output_tensor);
+    index_t size = input_tensors[0]->size();
+    T *output_ptr = output_tensor->mutable_data<T>();
+    memset(output_ptr, 0, size * sizeof(T));
+    int n = input_tensors.size();
    for (int i = 0; i < n; ++i) {
+      Tensor::MappingGuard input_map(input_tensors[i]);
+      const T *input_ptr = input_tensors[i]->data<T>();
      for (index_t j = 0; j < size; ++j) {
-        output[j] += inputs[i][j];
+        output_ptr[j] += input_ptr[j];
      }
    }
  }
 };

-template <>
+template<>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    const vector<const float *> &inputs, float *output, index_t size);
+    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
+
+template<>
+void AddNFunctor<DeviceType::OPENCL, float>::operator()(
+    std::vector<const Tensor *> &inputs, Tensor *output);

 }  //  namespace kernels
 }  //  namespace mace

--- a/mace/kernels/neon/addn_neon.cc
+++ b/mace/kernels/neon/addn_neon.cc
@@ -10,10 +10,12 @@ namespace kernels {

 template <>
 void AddNFunctor<DeviceType::NEON, float>::operator()(
-    const vector<const float *> &inputs, float *output, index_t size) {
+    std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
  // TODO: neon mem copy
-  memset(output, 0, size * sizeof(float));
-  int n = inputs.size();
+  index_t size = output_tensor->size();
+  float *output_ptr = output_tensor->mutable_data<float>();
+  memset(output_ptr, 0, size * sizeof(float));
+  int n = input_tensors.size();
  int64_t cost = size * n;
  int64_t groups = 1;
  if (cost > kCostPerGroup) {
@@ -27,8 +29,9 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
    int nn = count >> 2;
    int remain = count - (nn << 2);
    for (int64_t j = 0; j < n; ++j) {
-      const float *inptr = inputs[j] + i;
-      float *outptr = output + i;
+      const float *input_base = input_tensors[j]->data<float>();
+      const float *inptr = input_base + i;
+      float *outptr = output_ptr + i;
      for (int k = 0; k < nn; ++k) {
        float32x4_t _inptr = vld1q_f32(inptr);
        float32x4_t _outptr = vld1q_f32(outptr);

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/addn.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+
+namespace mace {
+namespace kernels {
+
+static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
+  index_t element_size = input0->NumElements();
+  index_t blocks = (element_size + 3) / 4;
+
+  const uint32_t gws = blocks;
+
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+
+  auto addn_kernel = cl::Kernel(program, "add2");
+
+  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
+
+  uint32_t idx = 0;
+  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input0->buffer())));
+  addn_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input1->buffer())));
+  addn_kernel.setArg(idx++, static_cast<int32_t>(element_size));
+  addn_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      addn_kernel, cl::NullRange,
+      cl::NDRange(gws),
+      cl::NDRange(lws));
+  MACE_CHECK(error == CL_SUCCESS);
+}
+
+template<>
+void AddNFunctor<DeviceType::OPENCL, float>::operator()(std::vector<const Tensor *> &input_tensors,
+                                                        Tensor *output_tensor) {
+
+  if (input_tensors.empty() || input_tensors.front() == nullptr) {
+    return;
+  }
+  size_t size = input_tensors.size();
+
+  switch (size) {
+    case 2:Add2(input_tensors[0], input_tensors[1], output_tensor);
+      break;
+    default:MACE_NOT_IMPLEMENTED;
+  }
+};
+
+}  // namespace kernels
+} //  namespace mace
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
+__kernel void add2(__global const float *input0,
+                   __global const float *input1,
+                   __private const int size,
+                   __global float *output) {
+  int idx = get_global_id(0);
+
+  if (idx + 4 > size) {
+    for(; idx < size; ++idx) {
+      *(output+idx) = *(input0+idx) + *(input1+idx);
+    }
+  } else {
+    float4 in_data0 = vload4(idx, input0);
+    float4 in_data1 = vload4(idx, input1);
+    vstore4(in_data0+in_data1, idx, output);
+  }
+}
+
--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON

+REGISTER_OPENCL_OPERATOR(AddN, AddNOp<DeviceType::OPENCL, float>);
+
 }  //  namespace mace
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -10,7 +10,7 @@

 namespace mace {

-template <DeviceType D, class T>
+template<DeviceType D, class T>
 class AddNOp : public Operator<D, T> {
 public:
  AddNOp(const OperatorDef &operator_def, Workspace *ws)
@@ -19,16 +19,13 @@ class AddNOp : public Operator<D, T> {
  bool Run() override {
    Tensor *output_tensor = this->outputs_[0];
    output_tensor->ResizeLike(this->inputs_[0]);
-    T *output = output_tensor->mutable_data<T>();
-    index_t size = this->inputs_[0]->size();
    int n = this->inputs_.size();
-    vector<const T *> inputs(n);
+    vector<const Tensor *> inputs(n, nullptr);
    for (int i = 0; i < n; ++i) {
-      const Tensor *input_tensor = this->inputs_[i];
-      inputs[i] = input_tensor->data<T>();
+      inputs[i] = this->inputs_[i];
    }

-    functor_(inputs, output, size);
+    functor_(inputs, output_tensor);
    return true;
  }


--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -9,9 +9,44 @@ namespace mace {

 class AddnOpTest : public OpsTestBase {};

-TEST_F(AddnOpTest, AddnOp) {
+template<DeviceType D>
+void SimpleAdd2() {
  // Construct graph
-  auto &net = test_net();
+  OpsTestNet net;
+  OpDefBuilder("AddN", "AddNTest")
+      .Input("Input1")
+      .Input("Input2")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
+
+  // Run
+  net.RunOp(D);
+
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {2, 4, 6, 8, 10, 12});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+
+TEST_F(AddnOpTest, CPUSimpleAdd2) {
+  SimpleAdd2<DeviceType::CPU>();
+}
+
+TEST_F(AddnOpTest, NEONSimpleAdd2) {
+  SimpleAdd2<DeviceType::NEON>();
+}
+
+TEST_F(AddnOpTest, OPENCLSimpleAdd2) {
+  SimpleAdd2<DeviceType::OPENCL>();
+}
+
+template<DeviceType D>
+void SimpleAdd3() {
+  // Construct graph
+  OpsTestNet net;
  OpDefBuilder("AddN", "AddNTest")
      .Input("Input1")
      .Input("Input2")
@@ -20,20 +55,62 @@ TEST_F(AddnOpTest, AddnOp) {
      .Finalize(net.NewOperatorDef());

  // Add input data
-  net.AddRandomInput<DeviceType::CPU, float>("Input1", {1, 2, 3, 4});
-  net.AddRandomInput<DeviceType::CPU, float>("Input2", {1, 2, 3, 4});
-  net.AddRandomInput<DeviceType::CPU, float>("Input3", {1, 2, 3, 4});
+  net.AddInputFromArray<D, float>("Input1", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input2", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});
+  net.AddInputFromArray<D, float>("Input3", {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6});

  // Run
-  net.RunOp();
+  net.RunOp(D);
+
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {3, 6, 9, 12, 15, 18});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+
+TEST_F(AddnOpTest, CPUSimpleAdd3) {
+  SimpleAdd3<DeviceType::CPU>();
+}

-  Tensor expected;
-  expected.Copy(*net.GetOutput("Output"));
+TEST_F(AddnOpTest, NEONSimpleAdd3) {
+  SimpleAdd3<DeviceType::NEON>();
+}
+
+template<DeviceType D>
+void RandomTest() {
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("AddN", "AddNTest")
+      .Input("Input1")
+      .Input("Input2")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<D, float>("Input1", {1, 2, 3, 4});
+  net.AddRandomInput<D, float>("Input2", {1, 2, 3, 4});

  // Check
-  net.RunOp(DeviceType::NEON);
+  net.RunOp(D);
+
+  Tensor result;
+  result.Copy(*net.GetOutput("Output"));
+
+  // Run
+  net.RunOp();
+
+  ExpectTensorNear<float>(*net.GetOutput("Output"), result, 1e-5);
+}
+
+TEST_F(AddnOpTest, CPURandom) {
+  RandomTest<DeviceType::CPU>();
+}
+
+TEST_F(AddnOpTest, NEONRandom) {
+  RandomTest<DeviceType::NEON>();
+}

-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
+TEST_F(AddnOpTest, OPENCLRandom) {
+  RandomTest<DeviceType::OPENCL>();
 }

 }  // namespace mace
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -12,6 +12,33 @@ pooling_type_mode = {
  'MaxPool': 2
 }

+def convert_tensor(op, tensor):
+  tf_tensor = op.outputs[0].eval()
+  tensor.name = op.outputs[0].name
+
+  shape = list(tf_tensor.shape)
+  if (op.name.find('pointwise_kernel') != -1 or
+          op.name.find('depthwise_kernel') != -1 or
+        op.name.endswith('weights') or
+        op.name.endswith('kernel')) \
+          and op.outputs[0].consumers()[0].type.find('Conv') != -1:
+    if op.outputs[0].consumers()[0].get_attr('data_format') == 'NCHW':
+      tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
+      shape = [shape[3], shape[2], shape[0], shape[1]]
+      # print (tensor.name, shape)
+  tensor.dims.extend(shape)
+
+  tf_dt = op.get_attr('dtype')
+  if tf_dt == tf.float32:
+    tensor.data_type = mace_pb2.DT_FLOAT
+    tensor.float_data.extend(tf_tensor.astype(float).flat)
+  elif tf_dt == tf.int32:
+    tensor.data_type = mace_pb2.DT_INT32
+    tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
+  else:
+    raise Exception("Not supported tensor type: " + tf_dt.name)
+
+
 def get_input_tensor(op, index):
  input_tensor = op.inputs[index]
  if input_tensor.op.type == 'Reshape':
@@ -24,26 +51,11 @@ def convert_ops(unresolved_ops, net_def):

  first_op = unresolved_ops[0]

-  if first_op.type == 'Placeholder' or first_op.type == 'Reshape':
+  if first_op.type in ['Placeholder', 'Reshape', 'Identity']:
    pass
  elif first_op.type == 'Const':
-    tf_tensor = first_op.outputs[0].eval()
    tensor = net_def.tensors.add()
-    tensor.name = first_op.outputs[0].name
-    # TODO: support other type than float
-    tensor.data_type = mace_pb2.DT_FLOAT
-
-    shape = list(tf_tensor.shape)
-    if (first_op.name.find('pointwise_kernel') != -1 or
-        first_op.name.find('depthwise_kernel') != -1 or
-        first_op.name.endswith('weights') or
-        first_op.name.endswith('kernel')) \
-        and first_op.outputs[0].consumers()[0].type.find('Conv') != -1:
-      tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
-      shape = [shape[3], shape[2], shape[0], shape[1]]
-      # print (tensor.name, shape)
-    tensor.dims.extend(shape)
-    tensor.float_data.extend(tf_tensor.astype(float).flat)
+    convert_tensor(first_op, tensor)
  elif first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative':
    op_def = net_def.op.add()
    op_def.name = first_op.name
@@ -61,9 +73,7 @@ def convert_ops(unresolved_ops, net_def):
    strides_arg.ints.extend(first_op.get_attr('strides')[2:])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
-    data_format_arg.s = first_op.get_attr('data_format')
-    if first_op.get_attr('data_format') != 'NCHW':
-      raise Exception('only support NCHW now')
+    data_format_arg.s = 'NCHW'

    if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
      bias_add_op = unresolved_ops[1]
@@ -78,7 +88,8 @@ def convert_ops(unresolved_ops, net_def):
    sub_op = unresolved_ops[5]
    add_1_op = unresolved_ops[6]
    # print (mul_op.type, mul_2_op.type, mul_1_op.type, sub_op.type)
-    if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add':
+    if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \
+                    mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add':
      raise Exception('Invalid BatchNorm Op')

    input_name = get_input_tensor(mul_1_op, 0).name
@@ -104,12 +115,6 @@ def convert_ops(unresolved_ops, net_def):
    max_limit_arg = op_def.arg.add()
    max_limit_arg.name = 'max_limit'
    max_limit_arg.f = 6
-  elif first_op.type == 'Relu':
-    op_def = net_def.op.add()
-    op_def.name = first_op.name
-    op_def.type = first_op.type
-    op_def.input.extend([input.name for input in first_op.inputs])
-    op_def.output.extend([output.name for output in first_op.outputs])
  elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool':
    op_def = net_def.op.add()
    op_def.name = first_op.name
@@ -130,9 +135,19 @@ def convert_ops(unresolved_ops, net_def):
    kernels_arg.ints.extend(first_op.get_attr('ksize')[2:])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
-    data_format_arg.s = first_op.get_attr('data_format')
-    if first_op.get_attr('data_format') != 'NCHW':
-      raise Exception('only support NCHW now')
+    data_format_arg.s = 'NCHW'
+  elif first_op.type == 'Add':
+    op_def = net_def.op.add()
+    op_def.name = first_op.name
+    op_def.type = "AddN"
+    op_def.input.extend([input.name for input in first_op.inputs])
+    op_def.output.extend([output.name for output in first_op.outputs])
+  elif first_op.type in ['Relu', 'ResizeBilinear', 'SpaceToBatchND', 'BatchToSpaceND']:
+    op_def = net_def.op.add()
+    op_def.name = first_op.name
+    op_def.type = first_op.type
+    op_def.input.extend([input.name for input in first_op.inputs])
+    op_def.output.extend([output.name for output in first_op.outputs])
  else:
    raise Exception('Unknown Op: ' + first_op.name)
    pass
@@ -152,4 +167,6 @@ def convert_to_mace_pb(input_graph_def):
      while len(unresolved_ops) > 0:
        convert_ops(unresolved_ops, net_def)

+  print "Done."
+
  return net_def