diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index b4644e652f2c2a82660e6dfd31bd74f3208383b0..e58de0d317869944b13ba597e7f691f63d6bcda3 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -215,8 +215,8 @@ DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs");
 DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
 DEFINE_string(model_data_file, "",
               "model data file name, used when EMBED_MODEL_DATA set to 0");
-DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_perf_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_priority_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, 4, "num of openmp threads");
 DEFINE_int32(cpu_power_option, 0,
              "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 79093fa177b2d7a7ee188e47c3e2a368c080c8a4..3f9314661e69937758459ebc0be0364e91e7645b 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -179,7 +179,7 @@ void OpenCLProfilingTimer::ClearTiming() {
   accumulated_micros_ = 0;
 }
 
-GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_DEFAULT;
+GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
 GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
     GPUPriorityHint::PRIORITY_DEFAULT;
 
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 38ff7da337607d6d6cb06fb6f720f19aa4edf67d..6f8a95cd603e60764094e3d0bdbedd0355798fd6 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -450,18 +450,6 @@ class CaffeConverter(object):
       final_op.output_shape_map[final_op.layer.top[0]] = output_shape
       self.resolved_ops.add(activation_op.name)
 
-    if op_def.type in ("Conv2D", "FusedConv2D") and \
-        output_shape[2] == 1 and \
-        ((input_format == 'NCHW' and output_shape[3] == 1) or
-         (input_format == 'NHWC' and output_shape[1] == 1)):
-      print "convert op %s from CONV to FC" % op.name
-      op_def.type = 'FC'
-      filter_shape = weight_data.shape
-      new_shape = [filter_shape[0],
-                   filter_shape[1] * filter_shape[2] * filter_shape[3],
-                   1, 1]
-      weight_data.reshape(new_shape)
-
     op_def.output.extend([final_op.name + ':0'])
     self.add_output_shape(op_def, output_shape)
     self.net_def.op.extend([op_def])
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 14ddf8810613f8e6098041ecfd7a28fa54a17ab5..a721e0fd2b849bea84a9c0e07a557078c5a6e217 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -363,17 +363,15 @@ class TFConverter(object):
     op_def.name = op.name
     if op.type == 'DepthwiseConv2dNative':
       op_def.type = 'DepthwiseConv2d'
-      if self.device == 'neon':
-        self.transpose_filter_tensor[get_input_tensor(
-            op, 1).name] = (3, 2, 0, 1)
     else:
       op_def.type = op.type
-      if self.device == 'neon':
-        self.transpose_filter_tensor[get_input_tensor(
-            op, 1).name] = (3, 2, 0, 1)
-      else:
-        self.transpose_filter_tensor[get_input_tensor(
-            op, 1).name] = (0, 1, 3, 2)
+
+    if self.device == 'neon':
+      self.transpose_filter_tensor[get_input_tensor(
+          op, 1).name] = (3, 2, 0, 1)
+    elif op.type == 'Conv2D':
+      self.transpose_filter_tensor[get_input_tensor(
+          op, 1).name] = (0, 1, 3, 2)
     if self.device == 'gpu':
       op_def.input.extend([op.inputs[0].name])
       if op_def.type == 'DepthwiseConv2d':
@@ -402,21 +400,71 @@ class TFConverter(object):
     final_op = op
     self.resolved_ops[op.name] = 1
 
-    # convert global conv to fc
+    if len(self.tf_graph.get(op.name, [])) == 1 and \
+        self.tf_graph[op.name][0].type == 'BiasAdd':
+      bias_add_op = self.tf_graph[op.name][0]
+      if self.device == 'gpu':
+        output_name = self.add_buffer_to_image(
+            get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
+        op_def.input.extend([output_name])
+      else:
+        op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
+      final_op = bias_add_op
+      self.resolved_ops[bias_add_op.name] = 1
+
+    if len(self.tf_graph.get(final_op.name, [])) == 1 and \
+        self.tf_graph[final_op.name][0].type in activation_name_map:
+      activation_op = self.tf_graph[final_op.name][0]
+      if op_def.type == "Conv2D":
+        op_def.type = "FusedConv2D"
+      fused_act_arg = op_def.arg.add()
+      fused_act_arg.name = 'activation'
+      fused_act_arg.s = activation_name_map[activation_op.type]
+      if activation_op.type == 'Relu6':
+        max_limit_arg = op_def.arg.add()
+        max_limit_arg.name = 'max_limit'
+        max_limit_arg.f = 6
+      final_op = activation_op
+      self.resolved_ops[activation_op.name] = 1
+
+    op_def.output.extend([output.name for output in final_op.outputs])
+    self.add_output_shape(final_op.outputs, op_def)
+    self.net_def.op.extend([op_def])
+
+  def check_conv_to_fc(self, op):
+    if self.device != 'neon' or op.type != "Conv2D":
+      return False
     filter_shape = get_input_tensor(op, 1).shape.as_list()
     input_shape = get_input_tensor(op, 0).shape.as_list()
-    if op_def.type == "Conv2D" and input_shape[1] == filter_shape[0] and \
-        input_shape[2] == filter_shape[1] and \
-        (op.get_attr('padding') == 'VALID' or filter_shape[0] == 1 and
-         filter_shape[1] == 1):
-      print "convert op %s from CONV to FC" % op.name
-      op_def.type = 'FC'
-      self.reshape_tensor[get_input_tensor(op, 1).name] = \
-          [filter_shape[3],
-           filter_shape[2] * filter_shape[1] * filter_shape[0], 1, 1]
+    return input_shape[1] == filter_shape[0] \
+        and input_shape[2] == filter_shape[1] \
+        and (op.get_attr('padding') == 'VALID' or filter_shape[0] == 1
+             and filter_shape[1] == 1)
+
+  def convert_global_conv_to_fc(self, op):
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = 'FC'
+    self.transpose_filter_tensor[get_input_tensor(op, 1).name] = \
+        (3, 2, 0, 1)
+    filter_shape = get_input_tensor(op, 1).shape.as_list()
+    self.reshape_tensor[get_input_tensor(op, 1).name] = \
+        [filter_shape[3],
+         filter_shape[2] * filter_shape[1] * filter_shape[0], 1, 1]
+    op_def.input.extend(
+        [get_input_tensor(op, i).name for i in range(len(op.inputs))])
+
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NCHW'
+    final_op = op
+    self.resolved_ops[op.name] = 1
 
     if len(self.tf_graph.get(op.name, [])) == 1 and \
-       self.tf_graph[op.name][0].type == 'BiasAdd':
+        self.tf_graph[op.name][0].type == 'BiasAdd':
       bias_add_op = self.tf_graph[op.name][0]
       if self.device == 'gpu':
         output_name = self.add_buffer_to_image(
@@ -428,10 +476,8 @@ class TFConverter(object):
       self.resolved_ops[bias_add_op.name] = 1
 
     if len(self.tf_graph.get(final_op.name, [])) == 1 and \
-       self.tf_graph[final_op.name][0].type in activation_name_map:
+        self.tf_graph[final_op.name][0].type in activation_name_map:
       activation_op = self.tf_graph[final_op.name][0]
-      if op_def.type == "Conv2D":
-        op_def.type = "FusedConv2D"
       fused_act_arg = op_def.arg.add()
       fused_act_arg.name = 'activation'
       fused_act_arg.s = activation_name_map[activation_op.type]
@@ -985,6 +1031,8 @@ class TFConverter(object):
         self.convert_reshape(op)
       elif self.is_atrous_conv2d(op):
         self.convert_atrous_conv2d(op)
+      elif self.check_conv_to_fc(op):
+        self.convert_global_conv_to_fc(op)
       elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
         if self.check_winograd_conv(op):
           self.convert_winograd_conv(op)
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 7e0f2c7025d88cb9b0348f79ed2f3f1725946cd4..8d83b03d59f9aa63700c5dadd026049515eff071 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -188,8 +188,8 @@ DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
 DEFINE_int32(round, 1, "round");
 DEFINE_int32(restart_round, 1, "restart round");
 DEFINE_int32(malloc_check_cycle, -1,
              "malloc debug check cycle, -1 to disable");
-DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_perf_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_priority_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, 4, "num of openmp threads");
 DEFINE_int32(cpu_power_option, 0,
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 9d848f9e89c0174bc99474df0802ee3a9739b69a..4c3d16a2b09fdd62f60d00e81f5eab020fcd825b 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -30,9 +30,13 @@ def adb_split_stdout(stdout_str):
 
 
 def adb_devices(target_socs=None):
-  outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$")
-  raw_lists = sh.cut(outputs, "-f1")
-  device_ids = adb_split_stdout(raw_lists)
+  device_ids = []
+  p = re.compile(r'(\w+)\s+device')
+  for line in adb_split_stdout(sh.adb("devices")):
+    m = p.match(line)
+    if m:
+      device_ids.append(m.group(1))
+
   if target_socs is not None:
     target_socs_set = set(target_socs)
     target_devices = []
@@ -49,7 +53,7 @@ def adb_getprop_by_serialno(serialno):
   outputs = sh.adb("-s", serialno, "shell", "getprop")
   raw_props = adb_split_stdout(outputs)
   props = {}
-  p = re.compile("\[(.+)\]: \[(.+)\]")
+  p = re.compile(r'\[(.+)\]: \[(.+)\]')
   for raw_prop in raw_props:
     m = p.match(raw_prop)
     if m:
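
Note: below is a minimal standalone sketch (not part of the patch) of how the
rewritten adb_devices() in tools/sh_commands.py picks serial numbers out of
`adb devices` output with the new r'(\w+)\s+device' pattern. The sample output
string is hypothetical; in the patch the lines come from
adb_split_stdout(sh.adb("devices")).

import re

# Hypothetical `adb devices` output; the real code reads it from the adb binary.
sample = ("List of devices attached\n"
          "0123456789ABCDEF\tdevice\n"
          "c4f556d2\tdevice\n")

p = re.compile(r'(\w+)\s+device')
device_ids = []
for line in sample.splitlines():
    # The header line never has a "<serial><whitespace>device" prefix, so
    # p.match() rejects it and only real device rows are collected.
    m = p.match(line)
    if m:
        device_ids.append(m.group(1))

print(device_ids)  # ['0123456789ABCDEF', 'c4f556d2']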
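Note: the new check_conv_to_fc()/convert_global_conv_to_fc() pair rewrites a
convolution whose kernel covers the entire input feature map (a "global" conv)
as a fully-connected layer. A toy sketch of the shape arithmetic, with assumed
NHWC input and HWIO filter shapes chosen purely for illustration:

# Assumed shapes, not taken from the patch.
input_shape = [1, 7, 7, 512]      # NHWC: batch, height, width, channels
filter_shape = [7, 7, 512, 1024]  # HWIO: kernel_h, kernel_w, in_ch, out_ch

# The kernel spans the full spatial extent, so a VALID conv yields a 1x1
# output and is equivalent to an FC layer over the flattened input.
is_global_conv = (input_shape[1] == filter_shape[0]
                  and input_shape[2] == filter_shape[1])

if is_global_conv:
    # The same reshape convert_global_conv_to_fc applies to the filter:
    # [out_ch, in_ch * kernel_w * kernel_h, 1, 1].
    fc_weight_shape = [filter_shape[3],
                       filter_shape[2] * filter_shape[1] * filter_shape[0],
                       1, 1]
    print(fc_weight_shape)  # [1024, 25088, 1, 1]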