Commit 1617b83f authored by liuqi

Support building a model once, then running it on CPU or GPU.

Parent 9716a876
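
The diff below is easier to follow with the overall flow in mind: when no runtime is given, the converter transforms the graph twice (once for GPU, once for a CPU deepcopy), tags every op with a "device" argument, and merges the two op lists and memory blocks; at load time, SerialNet and Workspace only create the ops and buffers whose "device" tag matches the current device type. A minimal standalone sketch of that flow, in plain Python with made-up names and placeholder device ids (not MACE code):

# Illustrative only: real MACE uses mace.proto OperatorDef and ArgumentHelper;
# plain dicts stand in for ops here.
CPU, GPU = 0, 2  # placeholder device ids, not the real mace_pb2 enum values

def tag_ops(ops, device):
    # Roughly what Transformer.add_device_and_data_type does: attach a device tag.
    return [dict(op, device=device) for op in ops]

def merge_graphs(gpu_ops, cpu_ops):
    # Roughly what the converter does when --runtime is empty: keep both copies.
    return gpu_ops + cpu_ops

def build_net(merged_ops, runtime_device):
    # Roughly what SerialNet/Workspace now do: skip ops with a different "device".
    return [op for op in merged_ops if op.get("device", -1) == runtime_device]

ops = [{"name": "conv1", "type": "Conv2D"}, {"name": "relu1", "type": "Activation"}]
merged = merge_graphs(tag_ops(ops, GPU), tag_ops(ops, CPU))
assert [op["name"] for op in build_net(merged, CPU)] == ["conv1", "relu1"]

In the real change, the GPU copy of the graph keeps OpenCL image buffers (mem_id >= 20000) while the CPU copy uses plain buffers (mem_id < 20000), which is why Workspace now partitions the memory arena by id.
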
......@@ -40,13 +40,19 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << ")";
OperatorDef temp_def(operator_def);
std::unique_ptr<OperatorBase> op(
op_registry->CreateOperator(temp_def, ws, type, mode));
if (op) {
operators_.emplace_back(std::move(op));
// TODO(liuqi): refactor based on PB
const int op_device =
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
operator_def, "device", -1);
if (op_device == type) {
VLOG(3) << "Creating operator " << operator_def.name() << "("
<< operator_def.type() << ")";
OperatorDef temp_def(operator_def);
std::unique_ptr<OperatorBase> op(
op_registry->CreateOperator(temp_def, ws, type, mode));
if (op) {
operators_.emplace_back(std::move(op));
}
}
}
}
......
......@@ -136,7 +136,11 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
// As DSP may have different data output type for each op,
// we stick to the same concept.
for (auto &op : net_def.op()) {
if (!op.mem_id().empty()) {
// TODO(liuqi): refactor based on PB
const int op_device =
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, "device", -1);
if (op_device == device_type && !op.mem_id().empty()) {
const DataType op_dtype = static_cast<DataType>(
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, "T", static_cast<int>(DT_FLOAT)));
......@@ -150,20 +154,29 @@ void Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block : net_def.mem_arena().mem_block()) {
if (device_type == DeviceType::GPU) {
std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
// TODO(liuqi): refactor based on PB
if (mem_block.mem_id() >= 20000) {
std::unique_ptr<BufferBase> image_buf(
new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(image_buf));
}
} else {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(device_type), mem_block.x()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
if (mem_block.mem_id() < 20000) {
std::unique_ptr<BufferBase> tensor_buf(
new Buffer(GetDeviceAllocator(device_type), mem_block.x()));
preallocated_allocator_.SetBuffer(mem_block.mem_id(),
std::move(tensor_buf));
}
}
}
VLOG(3) << "Preallocate buffer to tensors";
for (auto &op : net_def.op()) {
if (!op.mem_id().empty()) {
// TODO(liuqi): refactor based on PB
const int op_device =
ArgumentHelper::GetSingleArgument<OperatorDef, int>(
op, "device", -1);
if (op_device == device_type && !op.mem_id().empty()) {
auto mem_ids = op.mem_id();
int count = mem_ids.size();
for (int i = 0; i < count; ++i) {
......
......@@ -16,6 +16,7 @@ import argparse
import sys
import hashlib
import os.path
import copy
from mace.proto import mace_pb2
from mace.python.tools import tf_dsp_converter_lib
......@@ -25,6 +26,7 @@ from mace.python.tools.converter_tool import base_converter as cvt
from mace.python.tools.converter_tool import tensorflow_converter
from mace.python.tools.converter_tool import caffe_converter
from mace.python.tools.converter_tool import transformer
from mace.python.tools.convert_util import mace_check
# ./bazel-bin/mace/python/tools/tf_converter --model_file quantized_test.pb \
......@@ -34,11 +36,14 @@ from mace.python.tools.converter_tool import transformer
FLAGS = None
data_type_map = {'DT_HALF': mace_pb2.DT_HALF,
'DT_FLOAT': mace_pb2.DT_FLOAT}
device_type_map = {'cpu': mace_pb2.CPU,
'gpu': mace_pb2.GPU,
'dsp': mace_pb2.HEXAGON}
device_data_type_map = {
mace_pb2.CPU: mace_pb2.DT_FLOAT,
mace_pb2.GPU: mace_pb2.DT_HALF,
mace_pb2.HEXAGON: mace_pb2.DT_UINT8
}
def file_checksum(fname):
......@@ -81,7 +86,7 @@ def main(unused_args):
if FLAGS.platform not in ['tensorflow', 'caffe']:
print ("platform %s is not supported." % FLAGS.platform)
sys.exit(-1)
if FLAGS.runtime not in ['cpu', 'gpu', 'dsp']:
if FLAGS.runtime not in ['cpu', 'gpu', 'dsp', '']:
print ("runtime %s is not supported." % FLAGS.runtime)
sys.exit(-1)
......@@ -95,8 +100,6 @@ def main(unused_args):
sys.exit(-1)
else:
option = cvt.ConverterOption()
option.data_type = data_type_map[FLAGS.data_type]
option.device = device_type_map[FLAGS.runtime]
option.winograd_enabled = bool(FLAGS.winograd)
input_node_names = FLAGS.input_node.split(',')
......@@ -117,8 +120,8 @@ def main(unused_args):
print("Convert model to mace model.")
if FLAGS.platform == 'tensorflow':
converter = tensorflow_converter.TensorflowConverter(option,
FLAGS.model_file) # noqa
converter = tensorflow_converter.TensorflowConverter(
option, FLAGS.model_file)
elif FLAGS.platform == 'caffe':
converter = caffe_converter.CaffeConverter(option,
FLAGS.model_file,
......@@ -126,16 +129,49 @@ def main(unused_args):
output_graph_def = converter.run()
print("Transform model to one that can better run on device.")
# TODO(liuqi/liyin): transform gpu/cpu and merge their ops
mace_transformer = transformer.Transformer(option, output_graph_def)
output_graph_def = mace_transformer.run()
if not FLAGS.runtime:
cpu_graph_def = copy.deepcopy(output_graph_def)
option.device = mace_pb2.CPU
option.data_type = device_data_type_map[mace_pb2.CPU]
option.disable_transpose_filters()
mace_cpu_transformer = transformer.Transformer(
option, cpu_graph_def)
cpu_graph_def = mace_cpu_transformer.run()
print "start optimize cpu memory."
memory_optimizer.optimize_cpu_memory(cpu_graph_def)
print "CPU memory optimization done."
print "start optimize memory."
if FLAGS.runtime == 'gpu':
memory_optimizer.optimize_gpu_memory(output_graph_def)
elif FLAGS.runtime == 'cpu':
memory_optimizer.optimize_cpu_memory(output_graph_def)
print "Memory optimization done."
option.device = mace_pb2.GPU
option.data_type = device_data_type_map[mace_pb2.GPU]
option.enable_transpose_filters()
mace_gpu_transformer = transformer.Transformer(
option, output_graph_def)
output_gpu_graph_def = mace_gpu_transformer.run()
print "start optimize gpu memory."
memory_optimizer.optimize_gpu_memory(output_gpu_graph_def)
print "GPU memory optimization done."
print "Merge cpu and gpu ops together"
output_graph_def.op.extend(cpu_graph_def.op)
output_graph_def.mem_arena.mem_block.extend(
cpu_graph_def.mem_arena.mem_block)
print "Merge done"
else:
option.device = device_type_map[FLAGS.runtime]
option.data_type = device_data_type_map[option.device]
mace_transformer = transformer.Transformer(
option, output_graph_def)
output_graph_def = mace_transformer.run()
print "start optimize memory."
if FLAGS.runtime == 'gpu':
memory_optimizer.optimize_gpu_memory(output_graph_def)
elif FLAGS.runtime == 'cpu':
memory_optimizer.optimize_cpu_memory(output_graph_def)
else:
mace_check(False, "runtime only support [gpu|cpu|dsp]")
print "Memory optimization done."
if FLAGS.output_type == 'source':
source_converter_lib.convert_to_source(
......@@ -188,7 +224,7 @@ def parse_args():
default="",
help="File to save the output graph to.")
parser.add_argument(
"--runtime", type=str, default="cpu", help="Runtime: cpu/gpu/dsp")
"--runtime", type=str, default="", help="Runtime: cpu/gpu/dsp")
parser.add_argument(
"--input_node",
type=str,
......@@ -196,11 +232,6 @@ def parse_args():
help="e.g., input_node")
parser.add_argument(
"--output_node", type=str, default="softmax", help="e.g., softmax")
parser.add_argument(
"--data_type",
type=str,
default='DT_FLOAT',
help="e.g., DT_HALF/DT_FLOAT")
parser.add_argument(
"--output_type", type=str, default="pb", help="output type: source/pb")
parser.add_argument(
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from mace.proto import mace_pb2
......@@ -117,6 +132,27 @@ class MaceKeyword(object):
mace_axis_str = 'axis'
mace_shape_str = 'shape'
mace_winograd_filter_transformed = 'is_filter_transformed'
mace_device = 'device'
class TransformerRule(Enum):
REMOVE_IDENTITY_OP = 0
TRANSFORM_GLOBAL_POOLING = 1
FOLD_SOFTMAX = 2
FOLD_BATCHNORM = 3,
FOLD_CONV_AND_BN = 4,
FOLD_DEPTHWISE_CONV_AND_BN = 5,
TRANSFORM_GPU_WINOGRAD = 6,
TRANSFORM_ADD_TO_BIASADD = 7,
FOLD_BIASADD = 8,
FOLD_ACTIVATION = 9,
TRANSPOSE_FILTERS = 10,
RESHAPE_FC_WEIGHT = 11,
TRANSPOSE_DATA_FORMAT = 12,
TRANSFORM_GLOBAL_CONV_TO_FC = 13,
TRANSFORM_BUFFER_IMAGE = 14,
ADD_DEVICE_AND_DATA_TYPE = 15,
SORT_BY_EXECUTION = 16
class ConverterInterface(object):
......@@ -162,6 +198,25 @@ class ConverterOption(object):
self._data_type = mace_pb2.DT_FLOAT
self._device = mace_pb2.CPU
self._winograd_enabled = False
self._transformer_option = [
TransformerRule.REMOVE_IDENTITY_OP,
TransformerRule.TRANSFORM_GLOBAL_POOLING,
TransformerRule.FOLD_SOFTMAX,
TransformerRule.FOLD_BATCHNORM,
TransformerRule.FOLD_CONV_AND_BN,
TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
TransformerRule.TRANSFORM_GPU_WINOGRAD,
TransformerRule.TRANSFORM_ADD_TO_BIASADD,
TransformerRule.FOLD_BIASADD,
TransformerRule.FOLD_ACTIVATION,
TransformerRule.TRANSPOSE_FILTERS,
TransformerRule.RESHAPE_FC_WEIGHT,
TransformerRule.TRANSPOSE_DATA_FORMAT,
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC,
TransformerRule.TRANSFORM_BUFFER_IMAGE,
TransformerRule.ADD_DEVICE_AND_DATA_TYPE,
TransformerRule.SORT_BY_EXECUTION,
]
@property
def input_nodes(self):
......@@ -183,6 +238,10 @@ class ConverterOption(object):
def winograd_enabled(self):
return self._winograd_enabled
@property
def transformer_option(self):
return self._transformer_option
@input_nodes.setter
def input_nodes(self, input_nodes):
for node in input_nodes:
......@@ -211,6 +270,14 @@ class ConverterOption(object):
def winograd_enabled(self, winograd_enabled):
self._winograd_enabled = winograd_enabled
def disable_transpose_filters(self):
if TransformerRule.TRANSPOSE_FILTERS in self._transformer_option:
self._transformer_option.remove(TransformerRule.TRANSPOSE_FILTERS)
def enable_transpose_filters(self):
if TransformerRule.TRANSPOSE_FILTERS not in self._transformer_option:
self._transformer_option.append(TransformerRule.TRANSPOSE_FILTERS)
class ConverterUtil(object):
@staticmethod
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import google.protobuf.text_format
......@@ -325,10 +340,6 @@ class CaffeConverter(base_converter.ConverterInterface):
op.input.extend(caffe_op.layer.bottom)
op.output.extend(caffe_op.layer.top)
data_type_arg = op.arg.add()
data_type_arg.name = 'T'
data_type_arg.i = self._option.data_type
ConverterUtil.add_data_format_arg(op, DataFormat.NCHW)
return op
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import tensorflow as tf
......@@ -197,11 +212,6 @@ class TensorflowConverter(base_converter.ConverterInterface):
for tf_output in tf_op.outputs:
output_shape = op.output_shape.add()
output_shape.dims.extend(tf_output.shape.as_list())
op.output_type.append(self._option.data_type)
data_type_arg = op.arg.add()
data_type_arg.name = 'T'
data_type_arg.i = self._option.data_type
ConverterUtil.add_data_format_arg(op, DataFormat.NHWC)
......@@ -289,7 +299,6 @@ class TensorflowConverter(base_converter.ConverterInterface):
op.input.extend([scale_name, offset_name])
del op.output[1:]
del op.output_shape[1:]
del op.output_type[1:]
def convert_pooling(self, tf_op):
op = self.convert_general_op(tf_op)
......
# Copyright 2018 Xiaomi, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import numpy as np
......@@ -11,6 +26,7 @@ from mace.python.tools.converter_tool.base_converter import FilterFormat
from mace.python.tools.converter_tool.base_converter import MaceOp
from mace.python.tools.converter_tool.base_converter import MaceKeyword
from mace.python.tools.converter_tool.base_converter import ConverterUtil
from mace.python.tools.converter_tool.base_converter import TransformerRule
from mace.python.tools.convert_util import mace_check
OPENCL_IMAGE_MAX_SIZE = 16384
......@@ -36,23 +52,52 @@ class Transformer(base_converter.ConverterInterface):
def __init__(self, option, model):
# DO NOT reorder the following transformers
self._registered_transformers = [
self.remove_identity_op,
self.transform_global_pooling,
self.fold_softmax,
self.fold_batchnorm,
self.fold_conv_and_bn, # data_format related
self.fold_depthwise_conv_and_bn, # data_format related
self.transform_gpu_winograd, # data_format related
self.transform_add_to_biasadd,
self.fold_biasadd,
self.fold_activation,
self.transpose_filters,
self.transpose_data_format,
self.transform_global_conv_to_fc,
self.transform_buffer_image,
self.sort_by_execution,
self._registered_transformers_order = [
TransformerRule.REMOVE_IDENTITY_OP,
TransformerRule.TRANSFORM_GLOBAL_POOLING,
TransformerRule.FOLD_SOFTMAX,
TransformerRule.FOLD_BATCHNORM,
TransformerRule.FOLD_CONV_AND_BN,
TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN,
TransformerRule.TRANSFORM_GPU_WINOGRAD,
TransformerRule.TRANSFORM_ADD_TO_BIASADD,
TransformerRule.FOLD_BIASADD,
TransformerRule.FOLD_ACTIVATION,
TransformerRule.TRANSPOSE_FILTERS,
TransformerRule.RESHAPE_FC_WEIGHT,
TransformerRule.TRANSPOSE_DATA_FORMAT,
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC,
TransformerRule.TRANSFORM_BUFFER_IMAGE,
TransformerRule.ADD_DEVICE_AND_DATA_TYPE,
TransformerRule.SORT_BY_EXECUTION,
]
self._registered_transformers = {
TransformerRule.REMOVE_IDENTITY_OP: self.remove_identity_op,
TransformerRule.TRANSFORM_GLOBAL_POOLING:
self.transform_global_pooling,
TransformerRule.FOLD_SOFTMAX: self.fold_softmax,
TransformerRule.FOLD_BATCHNORM: self.fold_batchnorm,
TransformerRule.FOLD_CONV_AND_BN:
self.fold_conv_and_bn, # data_format related
TransformerRule.FOLD_DEPTHWISE_CONV_AND_BN:
self.fold_depthwise_conv_and_bn, # data_format related
TransformerRule.TRANSFORM_GPU_WINOGRAD:
self.transform_gpu_winograd, # data_format related
TransformerRule.TRANSFORM_ADD_TO_BIASADD:
self.transform_add_to_biasadd,
TransformerRule.FOLD_BIASADD: self.fold_biasadd,
TransformerRule.FOLD_ACTIVATION: self.fold_activation,
TransformerRule.TRANSPOSE_FILTERS: self.transpose_filters,
TransformerRule.RESHAPE_FC_WEIGHT: self.reshape_fc_weight,
TransformerRule.TRANSPOSE_DATA_FORMAT: self.transpose_data_format,
TransformerRule.TRANSFORM_GLOBAL_CONV_TO_FC:
self.transform_global_conv_to_fc,
TransformerRule.TRANSFORM_BUFFER_IMAGE:
self.transform_buffer_image,
TransformerRule.ADD_DEVICE_AND_DATA_TYPE:
self.add_device_and_data_type,
TransformerRule.SORT_BY_EXECUTION: self.sort_by_execution,
}
self._option = option
self._model = model
......@@ -67,12 +112,14 @@ class Transformer(base_converter.ConverterInterface):
self._target_data_format = DataFormat.NCHW
def run(self):
for transformer in self._registered_transformers:
while True:
self.construct_ops_and_consumers()
changed = transformer()
if not changed:
break
for key in self._registered_transformers_order:
if key in self._option.transformer_option:
transformer = self._registered_transformers[key]
while True:
self.construct_ops_and_consumers()
changed = transformer()
if not changed:
break
return self._model
......@@ -404,19 +451,16 @@ class Transformer(base_converter.ConverterInterface):
wt_output_shape.dims.extend(
[16, in_channels, wt_output_width, 1])
arg = wt_op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
if ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str) \
is not None:
padding_arg = wt_op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_str
padding_arg.i = ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_str).i # noqa
elif ConverterUtil.get_arg(op,
MaceKeyword.mace_padding_values_str) is not None: # noqa
padding_arg.i = ConverterUtil.get_arg(
op, MaceKeyword.mace_padding_str).i
elif ConverterUtil.get_arg(
op, MaceKeyword.mace_padding_values_str)\
is not None:
padding_arg = wt_op.arg.add()
padding_arg.name = MaceKeyword.mace_padding_values_str
padding_arg.ints.extend(ConverterUtil.get_arg(
......@@ -432,9 +476,6 @@ class Transformer(base_converter.ConverterInterface):
matmul_output_shape.dims.extend(
[16, out_channels, wt_output_width, 1])
arg = matmul_op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
arg = matmul_op.arg.add()
arg.name = MaceKeyword.mace_winograd_filter_transformed
arg.i = 1
......@@ -451,9 +492,6 @@ class Transformer(base_converter.ConverterInterface):
iwt_output_shape = iwt_op.output_shape.add()
iwt_output_shape.dims.extend(op.output_shape[0].dims)
arg = iwt_op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
batch_arg = iwt_op.arg.add()
batch_arg.name = 'batch'
batch_arg.i = batch
......@@ -618,10 +656,6 @@ class Transformer(base_converter.ConverterInterface):
dims_arg.name = MaceKeyword.mace_dims_str
dims_arg.ints.extend([0, 3, 1, 2])
arg = op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
for output_node in self._option.output_nodes.values():
output_name = MaceKeyword.mace_output_node_name \
+ '_' + output_node.name
......@@ -639,75 +673,43 @@ class Transformer(base_converter.ConverterInterface):
dims_arg.name = MaceKeyword.mace_dims_str
dims_arg.ints.extend([0, 2, 3, 1])
arg = op.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
return False
def transpose_filters(self):
net = self._model
filter_format = self.filter_format()
# TODO(liyin/liuqi): remove this if-condition after combine cpu/gpu
if self._option.device == mace_pb2.CPU:
print("Transpose filters to OIHW")
# transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM)
if filter_format == FilterFormat.HWIO:
for op in net.op:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name \
or op.type == MaceOp.DepthwiseConv2d.name:
if ConverterUtil.get_arg(op,
MaceKeyword.mace_winograd_filter_transformed) is None: # noqa
filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape(
filter.dims)
filter_data = filter_data.transpose(3, 2, 0, 1)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
self.set_filter_format(FilterFormat.OIHW)
elif self._option.device == mace_pb2.GPU:
# TODO(liyin/liuqi): remove this whole logic after combine cpu/gpu
print("Transpose filters to HWOI/HWIM")
print("Transpose filters to OIHW")
# transpose filter to OIHW/MIHW for tensorflow (HWIO/HWIM)
if filter_format == FilterFormat.HWIO:
for op in net.op:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name \
or op.type == MaceOp.DepthwiseConv2d.name:
filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape(
filter.dims)
# transpose filter to HWOI/HWIM for
# tensorflow and caffe (OIHW/MIHW)
if filter_format == FilterFormat.HWIO \
and (op.type == MaceOp.Conv2D.name
or op.type == MaceOp.Deconv2D.name):
filter_data = filter_data.transpose(0, 1, 3, 2)
if ConverterUtil.get_arg(
op, MaceKeyword.mace_winograd_filter_transformed)\
is None:
filter = self._consts[op.input[1]]
filter_data = np.array(filter.float_data).reshape(
filter.dims)
filter_data = filter_data.transpose(3, 2, 0, 1)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
elif filter_format == FilterFormat.OIHW:
if op.type == MaceOp.Conv2D.name \
or op.type == MaceOp.Deconv2D.name:
filter_data = filter_data.transpose(2, 3, 0, 1)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
elif op.type == MaceOp.DepthwiseConv2d.name:
filter_data = filter_data.transpose(2, 3, 1, 0)
filter.float_data[:] = filter_data.flat
filter.dims[:] = filter_data.shape
if op.type == MaceOp.FullyConnected.name:
weight = self._consts[op.input[1]]
input_shape = list(self._producer[op.input[0]]
.output_shape[0].dims)
weight_shape = [weight.dims[0]] + input_shape[1:]
# OCHW -> OHWC
weight_data = np.array(weight.float_data).reshape(
weight_shape)
weight_data = weight_data.transpose(0, 2, 3, 1)
weight.float_data[:] = weight_data.flat
self.set_filter_format(FilterFormat.HWOI)
self.set_filter_format(FilterFormat.OIHW)
return False
def reshape_fc_weight(self):
net = self._model
for op in net.op:
if op.type == MaceOp.FullyConnected.name:
weight = self._consts[op.input[1]]
# NCHW
input_shape = list(self._producer[op.input[0]]
.output_shape[0].dims)
weight_shape = [weight.dims[0]] + input_shape[1:]
del weight.dims[:]
weight.dims.extend(weight_shape)
return False
......@@ -727,9 +729,6 @@ class Transformer(base_converter.ConverterInterface):
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_mode
arg.i = 0
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
op.input[input_idx] = output_name
......@@ -788,9 +787,6 @@ class Transformer(base_converter.ConverterInterface):
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_buffer_type
arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
for output_node in self._option.output_nodes.values():
output_name = MaceKeyword.mace_output_node_name \
......@@ -806,9 +802,6 @@ class Transformer(base_converter.ConverterInterface):
arg = op_def.arg.add()
arg.name = MaceKeyword.mace_buffer_type
arg.i = OpenCLBufferType.IN_OUT_CHANNEL.value
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self._option.data_type
return False
......@@ -885,6 +878,19 @@ class Transformer(base_converter.ConverterInterface):
in_channels * filter_width
* filter_height][:]
def add_device_and_data_type(self):
# TODO(liuqi) add device definition in OperatorDef
net = self._model
for op in net.op:
arg = op.arg.add()
arg.name = MaceKeyword.mace_device
arg.i = self._option.device
data_type_arg = op.arg.add()
data_type_arg.name = 'T'
data_type_arg.i = self._option.data_type
return False
def sort_dfs(self, op, visited, sorted_nodes):
visited.update([op.name])
if len(op.input) > 0:
......
......@@ -167,7 +167,6 @@ def convert_to_source(net_def, model_checksum, weight_checksum, template_dir,
tensor_info=tensor_info,
tensor=t,
tag=model_tag,
runtime=runtime,
offset=offset,
)
model_data.extend(tensor_info.data)
......
......@@ -55,6 +55,7 @@ void BufferToImage(const std::string &input_name,
const std::string &output_name,
const int buffer_type,
const std::vector<int> &mem_ids,
const DeviceType device_type,
NetDef *net_def,
const int mode = NetMode::NORMAL) {
OperatorDef operator_def;
......@@ -64,6 +65,7 @@ void BufferToImage(const std::string &input_name,
.Output(output_name)
.AddIntArg("buffer_type", buffer_type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.AddIntArg("mode", mode)
.Finalize(&operator_def);
......@@ -76,6 +78,7 @@ template <typename T>
void ImageToBuffer(const std::string &input_name,
const std::string &output_name,
const int buffer_type,
const DeviceType device_type,
NetDef *net_def) {
OperatorDef operator_def;
......@@ -84,6 +87,7 @@ void ImageToBuffer(const std::string &input_name,
.Output(output_name)
.AddIntArg("buffer_type", buffer_type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
......@@ -94,6 +98,7 @@ void Conv3x3(const std::string &input_name,
const std::string &filter_name,
const std::string &output_name,
const std::vector<int> &mem_ids,
const DeviceType device_type,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("Conv2D", "Conv2dOp")
......@@ -104,6 +109,7 @@ void Conv3x3(const std::string &input_name,
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.Finalize(&operator_def);
operator_def.set_mem_id(mem_ids);
......@@ -113,6 +119,7 @@ void Conv3x3(const std::string &input_name,
template <typename T>
void Relu(const std::string &input_name,
const std::string &output_name,
const DeviceType device_type,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("Activation", "ReluTest")
......@@ -120,6 +127,7 @@ void Relu(const std::string &input_name,
.Output(output_name)
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
......@@ -195,7 +203,8 @@ std::map<std::string, int> AddMemoryOptimization(
const std::vector<std::vector<int64_t>> &output_shapes,
NetDef *net_def) {
std::map<std::string, int> res;
int mem_id = 0;
// TODO(liuqi) refactor based on PB
int mem_id = 20000;
size_t input_shape_size = input_shapes.size();
uint32_t in_mem_block_x = 0;
uint32_t in_mem_block_y = 0;
......@@ -269,21 +278,25 @@ void MaceRunFunc(const int in_out_size) {
BufferToImage<half>(input_name, input_names[i],
mace::kernels::IN_OUT_CHANNEL,
{mem_map[input_names[i]]},
device,
&net_def);
}
BufferToImage<half>(filter_tensor_name, filter_tensor_img_name,
mace::kernels::CONV2D_FILTER, {},
mace::kernels::CONV2D_FILTER, {}, device,
&net_def, NetMode::INIT);
for (size_t i = 0; i < output_names.size(); ++i) {
Conv3x3<half>(input_names[i], filter_tensor_img_name,
output_names[i], {mem_map[output_names[i]]},
device,
&net_def);
}
for (size_t i = 0; i < output_names.size(); ++i) {
std::string output_name = MakeString("mace_output_node_",
output_names[i]);
ImageToBuffer<float>(output_names[i], output_name,
mace::kernels::IN_OUT_CHANNEL, &net_def);
mace::kernels::IN_OUT_CHANNEL,
device,
&net_def);
}
const std::string file_path ="/data/local/tmp/mace";
......
......@@ -65,6 +65,7 @@ void BufferToImage(const std::string &input_name,
const std::string &output_name,
const int buffer_type,
const std::vector<int> &mem_ids,
const DeviceType device_type,
NetDef *net_def,
const int mode = NetMode::NORMAL) {
OperatorDef operator_def;
......@@ -74,6 +75,7 @@ void BufferToImage(const std::string &input_name,
.Output(output_name)
.AddIntArg("buffer_type", buffer_type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.AddIntArg("mode", mode)
.Finalize(&operator_def);
......@@ -86,6 +88,7 @@ template <typename T>
void ImageToBuffer(const std::string &input_name,
const std::string &output_name,
const int buffer_type,
const DeviceType device_type,
NetDef *net_def) {
OperatorDef operator_def;
......@@ -94,6 +97,7 @@ void ImageToBuffer(const std::string &input_name,
.Output(output_name)
.AddIntArg("buffer_type", buffer_type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
......@@ -104,6 +108,7 @@ void Conv3x3(const std::string &input_name,
const std::string &filter_name,
const std::string &output_name,
const std::vector<int> &mem_ids,
const DeviceType device_type,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("Conv2D", "Conv2dOp")
......@@ -114,6 +119,7 @@ void Conv3x3(const std::string &input_name,
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.Finalize(&operator_def);
operator_def.set_mem_id(mem_ids);
......@@ -123,6 +129,7 @@ void Conv3x3(const std::string &input_name,
template <typename T>
void Relu(const std::string &input_name,
const std::string &output_name,
const DeviceType device_type,
NetDef *net_def) {
OperatorDef operator_def;
ops::test::OpDefBuilder("Activation", "ReluTest")
......@@ -130,6 +137,7 @@ void Relu(const std::string &input_name,
.Output(output_name)
.AddStringArg("activation", "RELU")
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("device", static_cast<int>(device_type))
.Finalize(&operator_def);
net_def->add_op()->CopyFrom(operator_def);
......@@ -205,7 +213,8 @@ std::map<std::string, int> AddMemoryOptimization(
const std::vector<std::vector<int64_t>> &output_shapes,
NetDef *net_def) {
std::map<std::string, int> res;
int mem_id = 0;
// TODO(liuqi) refactor based on PB
int mem_id = 20000;
size_t input_shape_size = input_shapes.size();
uint32_t in_mem_block_x = 0;
uint32_t in_mem_block_y = 0;
......@@ -279,21 +288,24 @@ void MaceRun(const int in_out_size,
BufferToImage<half>(input_name, input_names[i],
mace::kernels::IN_OUT_CHANNEL,
{mem_map[input_names[i]]},
device,
&net_def);
}
BufferToImage<half>(filter_tensor_name, filter_tensor_img_name,
mace::kernels::CONV2D_FILTER, {},
mace::kernels::CONV2D_FILTER, {}, device,
&net_def, NetMode::INIT);
for (size_t i = 0; i < output_names.size(); ++i) {
Conv3x3<half>(input_names[i], filter_tensor_img_name,
output_names[i], {mem_map[output_names[i]]},
&net_def);
device, &net_def);
}
for (size_t i = 0; i < output_names.size(); ++i) {
std::string output_name = MakeString("mace_output_node_",
output_names[i]);
ImageToBuffer<float>(output_names[i], output_name,
mace::kernels::IN_OUT_CHANNEL, &net_def);
mace::kernels::IN_OUT_CHANNEL,
device,
&net_def);
}
MaceEngine engine(&net_def, device, input_names, output_names);
......
......@@ -62,27 +62,23 @@ def get_target_socs(configs):
return target_socs
def get_data_and_device_type(runtime):
data_type = ""
def parse_device_type(runtime):
device_type = ""
if runtime == "dsp":
data_type = "DT_UINT8"
device_type = "HEXAGON"
elif runtime == "gpu":
data_type = "DT_HALF"
device_type = "GPU"
elif runtime == "cpu":
data_type = "DT_FLOAT"
device_type = "CPU"
return data_type, device_type
return device_type
def get_hexagon_mode(configs):
runtime_list = []
for model_name in configs["models"]:
model_runtime = configs["models"][model_name]["runtime"]
model_runtime = configs["models"][model_name].get("runtime", "")
runtime_list.append(model_runtime.lower())
global_runtime = ""
......@@ -114,7 +110,7 @@ def model_benchmark_stdout_processor(stdout,
abi,
serialno,
model_name,
runtime):
device_type):
metrics = [0] * 3
for line in stdout.split('\n'):
line = line.strip()
......@@ -138,14 +134,14 @@ def model_benchmark_stdout_processor(stdout,
f.write("model_name,device_name,soc,abi,runtime,"
"init,warmup,run_avg\n")
data_str = "{model_name},{device_name},{soc},{abi},{runtime}," \
data_str = "{model_name},{device_name},{soc},{abi},{device_type}," \
"{init},{warmup},{run_avg}\n" \
.format(
model_name=model_name,
device_name=device_name,
soc=target_soc,
abi=abi,
runtime=runtime,
device_type=device_type,
init=metrics[0],
warmup=metrics[1],
run_avg=metrics[2]
......@@ -154,8 +150,7 @@ def model_benchmark_stdout_processor(stdout,
f.write(data_str)
def tuning_run(runtime,
target_abi,
def tuning_run(target_abi,
serialno,
vlog_level,
embed_model_data,
......@@ -205,7 +200,7 @@ def tuning_run(runtime,
if running_round > 0 and FLAGS.collect_report:
model_benchmark_stdout_processor(
stdout, target_abi, serialno, model_name, runtime)
stdout, target_abi, serialno, model_name, device_type)
def build_mace_run_prod(hexagon_mode, runtime, target_abi,
......@@ -222,7 +217,7 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
strip = "never"
debug = True
if runtime == "gpu":
if not runtime or runtime == "gpu":
gen_opencl_and_tuning_code(target_abi, serialno, [], False)
sh_commands.bazel_build(
mace_run_target,
......@@ -234,19 +229,14 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
sh_commands.update_mace_run_lib(model_output_dir,
model_name, embed_model_data)
tuning_run(runtime, target_abi, serialno, vlog_level, embed_model_data,
device_type = parse_device_type("gpu")
tuning_run(target_abi, serialno, vlog_level, embed_model_data,
model_output_dir, input_nodes, output_nodes, input_shapes,
output_shapes, model_name, device_type, running_round=0,
restart_round=1, out_of_range_check=False,
phone_data_dir=phone_data_dir, tuning=tuning,
limit_opencl_kernel_time=limit_opencl_kernel_time)
tuning_run(runtime, target_abi, serialno, vlog_level, embed_model_data,
model_output_dir, input_nodes, output_nodes, input_shapes,
output_shapes, model_name, device_type, running_round=0,
restart_round=1, out_of_range_check=True,
phone_data_dir=phone_data_dir, tuning=False)
gen_opencl_and_tuning_code(target_abi, serialno, [model_output_dir],
True)
sh_commands.bazel_build(
......@@ -391,8 +381,7 @@ def parse_model_configs():
print("'platform' must be 'tensorflow' or 'caffe'")
exit(1)
for key in ["model_file_path", "model_sha256_checksum",
"runtime"]:
for key in ["model_file_path", "model_sha256_checksum"]:
value = model_config.get(key, "")
if value == "":
print("CONFIG ERROR:")
......@@ -529,6 +518,11 @@ def parse_args():
type=str,
default="",
help="Valgrind command args.")
parser.add_argument(
"--validation_runtime",
type=str,
default="cpu",
help="validation runtime.")
return parser.parse_known_args()
......@@ -541,9 +535,11 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
print '===================', model_name, '==================='
model_config = configs["models"][model_name]
input_file_list = model_config["validation_inputs_data"]
data_type, device_type = get_data_and_device_type(
model_config["runtime"])
model_runtime = model_config.get("runtime", "")
model_device_type = parse_device_type(model_runtime)
run_device_type = model_device_type
if not run_device_type:
run_device_type = parse_device_type(FLAGS.validation_runtime)
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
model_output_base_dir = "%s/%s/%s/%s/%s" % (
......@@ -581,7 +577,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
if FLAGS.mode == "build" or FLAGS.mode == "all":
build_mace_run_prod(hexagon_mode,
model_config["runtime"],
model_runtime,
target_abi,
serialno,
vlog_level,
......@@ -592,7 +588,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_config["input_shapes"],
model_config["output_shapes"],
model_name,
device_type,
model_device_type,
FLAGS.round,
FLAGS.restart_round,
FLAGS.tuning,
......@@ -607,8 +603,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
FLAGS.mode == "all":
tuning_run(model_config["runtime"],
target_abi,
tuning_run(target_abi,
serialno,
vlog_level,
embed_model_data,
......@@ -618,7 +613,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_config["input_shapes"],
model_config["output_shapes"],
model_name,
device_type,
run_device_type,
FLAGS.round,
FLAGS.restart_round,
FLAGS.out_of_range_check,
......@@ -641,7 +636,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_config["input_shapes"],
model_config["output_shapes"],
model_name,
device_type,
run_device_type,
phone_data_dir,
FLAGS.omp_num_threads,
FLAGS.cpu_affinity_policy,
......@@ -654,7 +649,7 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
model_file_path,
weight_file_path,
model_config["platform"],
model_config["runtime"],
run_device_type,
model_config["input_nodes"],
model_config["output_nodes"],
model_config["input_shapes"],
......@@ -746,8 +741,7 @@ def main(unused_args):
for model_name in configs["models"]:
print '===================', model_name, '==================='
model_config = configs["models"][model_name]
data_type, device_type = get_data_and_device_type(
model_config["runtime"])
runtime = model_config.get("runtime", "")
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
......@@ -778,8 +772,7 @@ def main(unused_args):
model_config["model_sha256_checksum"],
",".join(model_config["input_nodes"]),
",".join(model_config["output_nodes"]),
data_type,
model_config["runtime"],
runtime,
model_name,
":".join(model_config["input_shapes"]),
model_config["dsp_mode"],
......
......@@ -465,7 +465,6 @@ def gen_model_code(model_codegen_dir,
model_sha256_checksum,
input_nodes,
output_nodes,
data_type,
runtime,
model_tag,
input_shapes,
......@@ -489,7 +488,6 @@ def gen_model_code(model_codegen_dir,
"--output=%s" % model_codegen_dir + "/model.cc",
"--input_node=%s" % input_nodes,
"--output_node=%s" % output_nodes,
"--data_type=%s" % data_type,
"--runtime=%s" % runtime,
"--output_type=source",
"--template=%s" % "mace/python/tools",
......@@ -703,7 +701,7 @@ def validate_model(abi,
model_file_path,
weight_file_path,
platform,
runtime,
device_type,
input_nodes,
output_nodes,
input_shapes,
......@@ -727,7 +725,7 @@ def validate_model(abi,
if platform == "tensorflow":
validate(platform, model_file_path, "",
"%s/%s" % (model_output_dir, input_file_name),
"%s/%s" % (model_output_dir, output_file_name), runtime,
"%s/%s" % (model_output_dir, output_file_name), device_type,
":".join(input_shapes), ":".join(output_shapes),
",".join(input_nodes), ",".join(output_nodes))
elif platform == "caffe":
......@@ -743,7 +741,8 @@ def validate_model(abi,
logger.error('There is no caffe python module.')
validate(platform, model_file_path, weight_file_path,
"%s/%s" % (model_output_dir, input_file_name),
"%s/%s" % (model_output_dir, output_file_name), runtime,
"%s/%s" % (model_output_dir, output_file_name),
device_type,
":".join(input_shapes), ":".join(output_shapes),
",".join(input_nodes), ",".join(output_nodes))
elif caffe_env == common.CaffeEnvType.DOCKER:
......@@ -806,7 +805,7 @@ def validate_model(abi,
"--weight_file=/mace/%s" % weight_file_name,
"--input_file=/mace/%s" % input_file_name,
"--mace_out_file=/mace/%s" % output_file_name,
"--mace_runtime=%s" % runtime,
"--device_type=%s" % device_type,
"--input_node=%s" % ",".join(input_nodes),
"--output_node=%s" % ",".join(output_nodes),
"--input_shape=%s" % ":".join(input_shapes),
......
......@@ -44,7 +44,7 @@ def load_data(file):
return np.empty([0])
def compare_output(platform, mace_runtime, output_name, mace_out_value,
def compare_output(platform, device_type, output_name, mace_out_value,
out_value):
if mace_out_value.size != 0:
out_value = out_value.reshape(-1)
......@@ -53,9 +53,9 @@ def compare_output(platform, mace_runtime, output_name, mace_out_value,
similarity = (1 - spatial.distance.cosine(out_value, mace_out_value))
print output_name, 'MACE VS', platform.upper(
), 'similarity: ', similarity
if (mace_runtime == "cpu" and similarity > 0.999) or \
(mace_runtime == "gpu" and similarity > 0.995) or \
(mace_runtime == "dsp" and similarity > 0.930):
if (device_type == "CPU" and similarity > 0.999) or \
(device_type == "GPU" and similarity > 0.995) or \
(device_type == "HEXAGON" and similarity > 0.930):
print '===================Similarity Test Passed=================='
else:
print '===================Similarity Test Failed=================='
......@@ -65,7 +65,7 @@ def compare_output(platform, mace_runtime, output_name, mace_out_value,
sys.exit(-1)
def validate_tf_model(platform, mace_runtime, model_file, input_file,
def validate_tf_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes, output_names):
import tensorflow as tf
if not os.path.isfile(model_file):
......@@ -100,11 +100,11 @@ def validate_tf_model(platform, mace_runtime, model_file, input_file,
output_file_name = common.formatted_file_name(
mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(platform, mace_runtime, output_names[i],
compare_output(platform, device_type, output_names[i],
mace_out_value, output_values[i])
def validate_caffe_model(platform, mace_runtime, model_file, input_file,
def validate_caffe_model(platform, device_type, model_file, input_file,
mace_out_file, weight_file, input_names, input_shapes,
output_names, output_shapes):
os.environ['GLOG_minloglevel'] = '1'  # suppress Caffe verbose prints
......@@ -144,12 +144,12 @@ def validate_caffe_model(platform, mace_runtime, model_file, input_file,
output_file_name = common.formatted_file_name(
mace_out_file, output_names[i])
mace_out_value = load_data(output_file_name)
compare_output(platform, mace_runtime, output_names[i], mace_out_value,
compare_output(platform, device_type, output_names[i], mace_out_value,
value)
def validate(platform, model_file, weight_file, input_file, mace_out_file,
mace_runtime, input_shape, output_shape, input_node, output_node):
device_type, input_shape, output_shape, input_node, output_node):
input_names = [name for name in input_node.split(',')]
input_shape_strs = [shape for shape in input_shape.split(':')]
input_shapes = [[int(x) for x in shape.split(',')]
......@@ -158,14 +158,14 @@ def validate(platform, model_file, weight_file, input_file, mace_out_file,
assert len(input_names) == len(input_shapes)
if platform == 'tensorflow':
validate_tf_model(platform, mace_runtime, model_file, input_file,
validate_tf_model(platform, device_type, model_file, input_file,
mace_out_file, input_names, input_shapes,
output_names)
elif platform == 'caffe':
output_shape_strs = [shape for shape in output_shape.split(':')]
output_shapes = [[int(x) for x in shape.split(',')]
for shape in output_shape_strs]
validate_caffe_model(platform, mace_runtime, model_file, input_file,
validate_caffe_model(platform, device_type, model_file, input_file,
mace_out_file, weight_file, input_names,
input_shapes, output_names, output_shapes)
......@@ -194,7 +194,7 @@ def parse_args():
default="",
help="mace output file to load.")
parser.add_argument(
"--mace_runtime", type=str, default="gpu", help="mace runtime device.")
"--device_type", type=str, default="", help="mace runtime device.")
parser.add_argument(
"--input_shape", type=str, default="1,64,64,3", help="input shape.")
parser.add_argument(
......@@ -214,7 +214,7 @@ if __name__ == '__main__':
FLAGS.weight_file,
FLAGS.input_file,
FLAGS.mace_out_file,
FLAGS.mace_runtime,
FLAGS.device_type,
FLAGS.input_shape,
FLAGS.output_shape,
FLAGS.input_node,
......