Commit d2c2897a authored by liyin

Quantize matmul only, add gather u8 test.

Parent 0c3cc381
......@@ -926,7 +926,9 @@ class EltwiseOp : public Operation {
const Tensor *input1,
Tensor *output) {
bool swapped = false;
-    if (input0->size() < input1->size()) {
+    if (input0->dim_size() < input1->dim_size()
+        || (input0->dim_size() == input1->dim_size()
+            && input0->size() < input1->size())) {
std::swap(input0, input1);
swapped = true;
}
......
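The new Eltwise swap condition prefers tensor rank over raw element count, so broadcasting always sees the higher-rank operand as input0. A minimal Python sketch of the predicate (plain lists standing in for tensor shapes, not the MACE API):

```python
from functools import reduce
from operator import mul

def should_swap(shape0, shape1):
    # New rule: swap when input0 has lower rank, or equal rank but
    # fewer elements, so the "bigger" operand always ends up first.
    size0, size1 = reduce(mul, shape0, 1), reduce(mul, shape1, 1)
    return (len(shape0) < len(shape1)
            or (len(shape0) == len(shape1) and size0 < size1))

assert should_swap([4], [2, 4])            # lower rank now goes second
assert not should_swap([1, 1, 4], [2, 4])  # old size-only rule would swap here
```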
......@@ -23,53 +23,67 @@ namespace test {
class GatherOpTest : public OpsTestBase {};
namespace {
template<typename T>
void TestGather(const std::vector<index_t> &weight_shape,
-                const std::vector<float> &weight,
+                const std::vector<T> &weight,
const std::vector<index_t> &input_shape,
const std::vector<int32_t> &input,
const int axis,
const std::vector<index_t> &output_shape,
-                const std::vector<float> &output) {
+                const std::vector<T> &output) {
OpsTestNet net;
-  net.AddInputFromArray<CPU, float>("Params", weight_shape, weight);
+  net.AddInputFromArray<CPU, T>("Params", weight_shape, weight);
net.AddInputFromArray<CPU, int32_t>("Indices", input_shape, input);
OpDefBuilder("Gather", "GatherTest")
.Input("Params")
.Input("Indices")
+      .AddIntArg("T", DataTypeToEnum<T>::v())
.AddIntArg("axis", axis)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(CPU);
-  auto expected = net.CreateTensor<float>(output_shape, output);
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  auto expected = net.CreateTensor<T>(output_shape, output);
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
TEST_F(GatherOpTest, CPUScalarIndex) {
-  TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+  TestGather<float>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
{}, {5}, 0, {2}, {10, 11});
+  TestGather<uint8_t>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                      10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+                      {}, {5}, 0, {2}, {10, 11});
}
TEST_F(GatherOpTest, CPURank1Index) {
-  TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+  TestGather<float>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
{3}, {2, 4, 6}, 0, {3, 2}, {4, 5, 8, 9, 12, 13});
+  TestGather<uint8_t>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                      10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+                      {3}, {2, 4, 6}, 0, {3, 2}, {4, 5, 8, 9, 12, 13});
}
TEST_F(GatherOpTest, CPURank1IndexWithAxis1) {
-  TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+  TestGather<float>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
{1}, {1}, 1, {10, 1}, {1, 3, 5, 7, 9, 11, 13, 15, 17, 19});
+  TestGather<uint8_t>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                      10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+                      {1}, {1}, 1, {10, 1}, {1, 3, 5, 7, 9, 11, 13, 15, 17, 19});
}
TEST_F(GatherOpTest, CPURankHighIndex) {
-  TestGather({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+  TestGather<float>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
{1, 3}, {2, 4, 6}, 0, {1, 3, 2}, {4, 5, 8, 9, 12, 13});
+  TestGather<uint8_t>({10, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                      10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
+                      {1, 3}, {2, 4, 6}, 0, {1, 3, 2}, {4, 5, 8, 9, 12, 13});
}
......
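For reference, the expected values in these tests (identical for the float and uint8 variants) match numpy's take semantics; a quick sketch, not part of the test suite:

```python
import numpy as np

params = np.arange(20, dtype=np.uint8).reshape(10, 2)
assert np.take(params, 5, axis=0).tolist() == [10, 11]        # scalar index
assert np.take(params, [2, 4, 6], axis=0).ravel().tolist() == \
    [4, 5, 8, 9, 12, 13]                                      # rank-1 index
assert np.take(params, [1], axis=1).ravel().tolist() == \
    [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]                       # axis = 1
```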
......@@ -233,8 +233,8 @@ class MatMulFixpointImpl<AOrder, BOrder, uint8_t> {
const index_t height,
const index_t K,
const index_t width,
-                     const bool lhs_bached,
-                     const bool rhs_bached,
+                     const bool lhs_batched,
+                     const bool rhs_batched,
Tensor *C) {
#if defined(MACE_ENABLE_NEON)
if (width == 1 && AOrder == gemmlowp::MapOrder::RowMajor) {
......@@ -245,8 +245,8 @@ class MatMulFixpointImpl<AOrder, BOrder, uint8_t> {
batch,
height,
K,
-                    true,
-                    true,
+                    lhs_batched,
+                    rhs_batched,
C);
} else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) {
gemv_kernel_.Compute(context,
......@@ -256,8 +256,8 @@ class MatMulFixpointImpl<AOrder, BOrder, uint8_t> {
batch,
width,
K,
-                    true,
-                    true,
+                    lhs_batched,
+                    rhs_batched,
C);
} else {
#endif // MACE_ENABLE_NEON
......@@ -281,11 +281,13 @@ class MatMulFixpointImpl<AOrder, BOrder, uint8_t> {
for (index_t i = 0; i < batch; ++i) {
gemmlowp::MatrixMap<const uint8_t, AOrder>
-      a_matrix(a_ptr_base + static_cast<index_t>(lhs_bached) * i * a_size,
+      a_matrix(a_ptr_base
+                   + static_cast<index_t>(lhs_batched) * i * a_size,
height,
K);
gemmlowp::MatrixMap<const uint8_t, BOrder>
-      b_matrix(b_ptr_base + static_cast<index_t>(rhs_bached) * i * b_size,
+      b_matrix(b_ptr_base
+                   + static_cast<index_t>(rhs_batched) * i * b_size,
K,
width);
gemmlowp::MatrixMap <uint8_t, gemmlowp::MapOrder::RowMajor>
......@@ -315,8 +317,8 @@ class MatMulFixpointImpl<AOrder, BOrder, int32_t> {
const index_t height,
const index_t K,
const index_t width,
-                     const bool lhs_bached,
-                     const bool rhs_bached,
+                     const bool lhs_batched,
+                     const bool rhs_batched,
Tensor *C) {
C->SetScale(A->scale() * B->scale());
C->SetZeroPoint(0);
......@@ -330,8 +332,8 @@ class MatMulFixpointImpl<AOrder, BOrder, int32_t> {
batch,
height,
K,
-                    lhs_bached,
-                    rhs_bached,
+                    lhs_batched,
+                    rhs_batched,
C);
} else if (height == 1 && BOrder == gemmlowp::MapOrder::ColMajor) {
gemv_kernel_.Compute(context,
......@@ -341,8 +343,8 @@ class MatMulFixpointImpl<AOrder, BOrder, int32_t> {
batch,
width,
K,
-                    lhs_bached,
-                    rhs_bached,
+                    lhs_batched,
+                    rhs_batched,
C);
} else {
#endif // MACE_ENABLE_NEON
......@@ -366,12 +368,12 @@ class MatMulFixpointImpl<AOrder, BOrder, int32_t> {
for (index_t i = 0; i < batch; ++i) {
gemmlowp::MatrixMap<const uint8_t, AOrder>
a_matrix
-          (a_ptr_base + static_cast<index_t>(lhs_bached) * i * a_size,
+          (a_ptr_base + static_cast<index_t>(lhs_batched) * i * a_size,
height,
K);
gemmlowp::MatrixMap<const uint8_t, BOrder>
b_matrix
-          (b_ptr_base + static_cast<index_t>(rhs_bached) * i * b_size,
+          (b_ptr_base + static_cast<index_t>(rhs_batched) * i * b_size,
K,
width);
gemmlowp::MatrixMap <int32_t, gemmlowp::MapOrder::RowMajor>
......
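Besides the lhs_bached → lhs_batched rename, the uint8 gemv/gemm paths above previously hard-coded `true, true`, advancing both base pointers on every batch even when one operand (for example, a shared weight matrix) is not batched. A sketch of the offset rule, in Python for brevity:

```python
def matrix_offset(batched, batch_index, matrix_size):
    # A non-batched operand keeps offset 0, so every batch reuses the
    # same underlying matrix instead of walking past its end.
    return int(batched) * batch_index * matrix_size

assert matrix_offset(True, 3, 64) == 192   # batched: advance per batch
assert matrix_offset(False, 3, 64) == 0    # shared: stay at the base pointer
```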
......@@ -135,7 +135,8 @@ void Complex(const std::vector<index_t> &batch,
rhs_batched,
&expected_output_tensor);
-  ExpectTensorNear<float>(expected_output_tensor, *net.GetTensor("Output"));
+  ExpectTensorNear<float>(expected_output_tensor, *net.GetTensor("Output"),
+                          1e-4, 1e-2);
}
} // namespace
......
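Quantized MatMul accumulates rounding error, so the comparison is relaxed from the exact defaults. A sketch of the check, assuming ExpectTensorNear's extra arguments are a relative and an absolute tolerance combined as |a - b| <= abs + rel * |b|:

```python
def near(a, b, rel=1e-4, abs_tol=1e-2):
    # Combined relative/absolute tolerance, per the assumption above.
    return abs(a - b) <= abs_tol + rel * abs(b)

assert near(1.000, 1.005)   # inside the quantization slack
assert not near(1.0, 1.5)   # a real mismatch still fails
```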
......@@ -236,6 +236,7 @@ class MaceKeyword(object):
mace_step_h_str = 'step_h'
mace_step_w_str = 'step_w'
mace_find_range_every_time = 'find_range_every_time'
+    mace_non_zero = 'non_zero'
mace_pad_type_str = 'pad_type'
......@@ -279,6 +280,7 @@ class TransformerRule(Enum):
FOLD_FC_RESHAPE = 37
TRANSFORM_CHANNEL_SHUFFLE = 38
UPDATE_DATA_FORMAT = 39
+    QUANTIZE_MATMUL_ONLY = 40
class ConverterInterface(object):
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import numpy as np
import six
......@@ -288,12 +288,11 @@ class TensorflowConverter(base_converter.ConverterInterface):
tf_graph_def.ParseFromString(f.read())
self._placeholders = {}
self.add_shape_info(tf_graph_def)
print("Run transform_graph: %s" % TFTransformGraphOptions[
option.device])
try:
print ("output keys: ", option.output_nodes.keys())
print("output keys: ", option.output_nodes.keys())
transformed_graph_def = TransformGraph(tf_graph_def,
option.input_nodes.keys(),
option.output_nodes.keys(),
......@@ -303,6 +302,16 @@ class TensorflowConverter(base_converter.ConverterInterface):
print("Failed to transform graph using tf tool: %s" % ex)
transformed_graph_def = tf_graph_def
+        # To check the optimized model, uncomment the following code.
+        # tf.io.write_graph(
+        #     transformed_graph_def,
+        #     ".",
+        #     os.path.basename(src_model_file)[:-3] + "_opt.pb",
+        #     as_text=False
+        # )
+        self.add_shape_info(transformed_graph_def)
with tf.Session() as session:
with session.graph.as_default() as graph:
tf.import_graph_def(transformed_graph_def, name='')
......
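If the dump above is uncommented, the optimized graph can be inspected offline. A hedged sketch using the TF 1.x GraphDef API (matching the converter's tf.Session usage; the file name is hypothetical):

```python
import tensorflow as tf

# Read back the graph written by tf.io.write_graph and list a few nodes.
graph_def = tf.GraphDef()
with open("model_opt.pb", "rb") as f:
    graph_def.ParseFromString(f.read())
print([(n.name, n.op) for n in graph_def.node][:10])
```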
......@@ -103,6 +103,8 @@ class Transformer(base_converter.ConverterInterface):
self.transform_caffe_reshape_and_flatten,
TransformerRule.TRANSFORM_CHANNEL_SHUFFLE:
self.transform_channel_shuffle,
+            TransformerRule.QUANTIZE_MATMUL_ONLY:
+                self.quantize_matmul_only,
}
self._option = option
......@@ -191,16 +193,23 @@ class Transformer(base_converter.ConverterInterface):
op = mace_pb2.OperatorDef()
op.name = self.normalize_op_name(input_node.name)
op.type = "Input"
+            data_type_arg = op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_FLOAT
op.output.extend([input_node.name])
output_shape = op.output_shape.add()
output_shape.dims.extend(input_node.shape)
-            if ConverterUtil.data_format(
-                    self._consumers[input_node.name][0]) \
-                    == DataFormat.NCHW:
-                self.transpose_shape(output_shape.dims, [0, 3, 1, 2])
-                ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
-            else:
-                ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
+            if input_node.name in self._consumers:
+                if ConverterUtil.data_format(
+                        self._consumers[input_node.name][0]) \
+                        == DataFormat.NCHW:
+                    self.transpose_shape(output_shape.dims,
+                                         [0, 3, 1, 2])
+                    ConverterUtil.add_data_format_arg(op,
+                                                      DataFormat.NCHW)
+                else:
+                    ConverterUtil.add_data_format_arg(op,
+                                                      DataFormat.NHWC)
self._producer[op.output[0]] = op
@staticmethod
......@@ -221,10 +230,32 @@ class Transformer(base_converter.ConverterInterface):
return name.replace(':', '_')
def get_tensor_shape(self, tensor):
-        producer = self._producer[tensor]
-        for i in six.moves.range(len(producer.output)):
-            if producer.output[i] == tensor:
-                return list(producer.output_shape[i].dims)
+        if tensor in self._consts:
+            return list(self._consts[tensor].dims)
+        elif tensor in self._producer:
+            producer = self._producer[tensor]
+            for i in six.moves.range(len(producer.output)):
+                if producer.output[i] == tensor:
+                    return list(producer.output_shape[i].dims)
+        else:
+            return None
+
+    def get_tensor_data_type(self, tensor):
+        if tensor in self._consts:
+            return self._consts[tensor].data_type
+        elif tensor in self._producer:
+            producer = self._producer[tensor]
+            for i in six.moves.range(len(producer.output)):
+                if producer.output[i] == tensor:
+                    if i < len(producer.output_type):
+                        return producer.output_type[i]
+                    elif ConverterUtil.get_arg(producer, "T") is not None:
+                        return ConverterUtil.get_arg(producer, "T").i
+                    else:
+                        print("No data type filled: ", producer)
+                        return None
+        else:
+            return None
def consumer_count(self, tensor_name):
return len(self._consumers.get(tensor_name, []))
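The new get_tensor_data_type follows the same lookup order as get_tensor_shape: const tensors carry their own data_type, otherwise the producer's explicit output_type wins, falling back to its "T" arg. A stripped-down sketch of that precedence (plain dicts, hypothetical names):

```python
def resolve_dtype(tensor, consts, output_types, t_args):
    if tensor in consts:            # 1. const tensors know their dtype
        return consts[tensor]
    if tensor in output_types:      # 2. the producer's explicit output_type
        return output_types[tensor]
    return t_args.get(tensor)       # 3. the producer's "T" arg, else None

assert resolve_dtype("w", {"w": "DT_UINT8"}, {}, {}) == "DT_UINT8"
```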
......@@ -1374,6 +1405,7 @@ class Transformer(base_converter.ConverterInterface):
return False
def update_data_format(self):
print("update data format")
data_format_flag = DataFormat.NHWC.value
for input_node in self._option.input_nodes.values():
if input_node.data_format.value == DataFormat.DF_NONE.value:
......@@ -1672,7 +1704,8 @@ class Transformer(base_converter.ConverterInterface):
quantize_util.adjust_range(input_node.range[0],
input_node.range[1],
non_zero=False)
-            quantize_info = mace_pb2.QuantizeActivationInfo()
+            quantize_info = \
+                mace_pb2.QuantizeActivationInfo()
quantize_info.minval = minval
quantize_info.maxval = maxval
quantize_info.scale = scale
......@@ -1893,3 +1926,111 @@ class Transformer(base_converter.ConverterInterface):
producer_op.output_shape[0].dims[:] = output_shape
return True
+    def quantize_matmul_only(self):
+        """
+        This transform rule is only used internally; we do not want to
+        make things too complex for users.
+        """
+        to_quantize_ops = [MaceOp.MatMul.name]
+        for op in self._model.op:
+            if (op.type not in to_quantize_ops or len(op.output) > 1
+                    or ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_op_data_type_str).i
+                    != mace_pb2.DT_FLOAT):
+                # only supports a single float output
+                continue
+            quantized_inputs_names = []
+            should_quantize = True
+            for idx, input_tensor in enumerate(op.input):
+                if self.get_tensor_data_type(input_tensor) \
+                        != mace_pb2.DT_FLOAT:
+                    should_quantize = False
+                    break
+            if not should_quantize:
+                continue
+
+            non_zero = self._option.device == DeviceType.CPU.value
+            for idx, input_tensor in enumerate(op.input):
+                quantized_inputs_names.append(input_tensor)
+                if input_tensor in self._consts:
+                    const_tensor = self._consts[input_tensor]
+                    quantized_tensor = quantize_util.quantize(
+                        const_tensor.float_data, non_zero)
+                    del const_tensor.float_data[:]
+                    const_tensor.int32_data.extend(quantized_tensor.data)
+                    const_tensor.data_type = mace_pb2.DT_UINT8
+                    const_tensor.scale = quantized_tensor.scale
+                    const_tensor.zero_point = quantized_tensor.zero
+                    const_tensor.minval = quantized_tensor.minval
+                    const_tensor.maxval = quantized_tensor.maxval
+                    const_tensor.quantized = True
+                else:
+                    input_shape = self.get_tensor_shape(input_tensor)
+                    quantize_op = self._model.op.add()
+                    quantize_op.name = self.normalize_op_name(
+                        input_tensor) + "_quant"
+                    quantize_op.type = MaceOp.Quantize.name
+                    quantize_op.input.extend([input_tensor])
+                    quantize_output_name = quantize_op.name + '_0'
+                    quantize_op.output.extend([quantize_output_name])
+                    output_shape = quantize_op.output_shape.add()
+                    output_shape.dims.extend(input_shape)
+                    quantize_op.output_type.extend([mace_pb2.DT_UINT8])
+                    data_type_arg = quantize_op.arg.add()
+                    data_type_arg.name = MaceKeyword.mace_op_data_type_str
+                    data_type_arg.i = mace_pb2.DT_UINT8
+
+                    data_type_arg = quantize_op.arg.add()
+                    data_type_arg.name = MaceKeyword.mace_non_zero
+                    if non_zero:
+                        data_type_arg.i = 1
+                    else:
+                        data_type_arg.i = 0
+
+                    find_range_arg = quantize_op.arg.add()
+                    find_range_arg.name = \
+                        MaceKeyword.mace_find_range_every_time
+                    find_range_arg.i = 1
+
+                    quantized_inputs_names[-1] = quantize_output_name
+                non_zero = False
+
+            del op.input[:]
+            op.input.extend(quantized_inputs_names)
+
+            original_output_name = op.output[0]
+            op.output[0] = original_output_name + "_quant"
+            op.output_type.extend([mace_pb2.DT_INT32])
+            data_type_arg = ConverterUtil.get_arg(
+                op, MaceKeyword.mace_op_data_type_str)
+            if data_type_arg is None:
+                data_type_arg = op.arg.add()
+                data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_UINT8
+
+            dequantize_op = self._model.op.add()
+            dequantize_op.name = op.name + "_dequant"
+            dequantize_op.type = MaceOp.Dequantize.name
+            dequantize_op.input.extend([op.output[0]])
+            dequantize_op.output.extend([original_output_name])
+            dequantize_op.output_shape.extend(op.output_shape)
+            dequantize_op.output_type.extend([mace_pb2.DT_FLOAT])
+            data_type_arg = dequantize_op.arg.add()
+            data_type_arg.name = MaceKeyword.mace_op_data_type_str
+            data_type_arg.i = mace_pb2.DT_INT32
+
+            quantize_flag_arg = ConverterUtil.get_arg(
+                self._model, MaceKeyword.mace_quantize_flag_arg_str)
+            if quantize_flag_arg is None:
+                quantize_flag_arg = self._model.arg.add()
+                quantize_flag_arg.name = \
+                    MaceKeyword.mace_quantize_flag_arg_str
+                quantize_flag_arg.i = 1
+
+            return True
+
+        return False
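The net effect of this rule: every float input of a MatMul gains a Quantize op (const inputs are quantized in place), the MatMul runs in uint8 with an int32 output whose scale is scale_A * scale_B and zero point 0 (as set by MatMulFixpointImpl above), and a trailing Dequantize restores the original float output name. A hypothetical numpy check of that arithmetic (not MACE's quantize_util):

```python
import numpy as np

def quantize_u8(x):
    # Affine uint8 quantization over a range that includes zero.
    minval, maxval = min(x.min(), 0.0), max(x.max(), 0.0)
    scale = (maxval - minval) / 255.0
    zero = int(round(-minval / scale))
    q = np.clip(np.round(x / scale) + zero, 0, 255).astype(np.uint8)
    return q, scale, zero

A = np.random.rand(4, 8).astype(np.float32)
B = np.random.rand(8, 3).astype(np.float32)
A_q, sa, za = quantize_u8(A)
B_q, sb, zb = quantize_u8(B)
C_q = (A_q.astype(np.int32) - za) @ (B_q.astype(np.int32) - zb)  # int32 MatMul
C = C_q * (sa * sb)                      # dequantize: scale_A * scale_B, zero 0
assert np.allclose(C, A @ B, atol=0.05)  # equal up to quantization error
```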