diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index b4644e652f2c2a82660e6dfd31bd74f3208383b0..e58de0d317869944b13ba597e7f691f63d6bcda3 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -215,8 +215,8 @@ DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs");
 DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
 DEFINE_string(model_data_file, "",
               "model data file name, used when EMBED_MODEL_DATA set to 0");
-DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_perf_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_priority_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, 4, "num of openmp threads");
 DEFINE_int32(cpu_power_option, 0,
              "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 79093fa177b2d7a7ee188e47c3e2a368c080c8a4..3f9314661e69937758459ebc0be0364e91e7645b 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -179,7 +179,7 @@ void OpenCLProfilingTimer::ClearTiming() {
   accumulated_micros_ = 0;
 }
 
-GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_DEFAULT;
+GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_NORMAL;
 GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
     GPUPriorityHint::PRIORITY_DEFAULT;
 
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 38ff7da337607d6d6cb06fb6f720f19aa4edf67d..6f8a95cd603e60764094e3d0bdbedd0355798fd6 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -450,18 +450,6 @@ class CaffeConverter(object):
       final_op.output_shape_map[final_op.layer.top[0]] = output_shape
       self.resolved_ops.add(activation_op.name)
 
-    if op_def.type in ("Conv2D", "FusedConv2D") and \
-        output_shape[2] == 1 and \
-        ((input_format == 'NCHW' and output_shape[3] == 1) or
-         (input_format == 'NHWC' and output_shape[1] == 1)):
-      print "convert op %s from CONV to FC" % op.name
-      op_def.type = 'FC'
-      filter_shape = weight_data.shape
-      new_shape = [filter_shape[0],
-                   filter_shape[1] * filter_shape[2] * filter_shape[3],
-                   1, 1]
-      weight_data.reshape(new_shape)
-
     op_def.output.extend([final_op.name + ':0'])
     self.add_output_shape(op_def, output_shape)
     self.net_def.op.extend([op_def])
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 14ddf8810613f8e6098041ecfd7a28fa54a17ab5..a721e0fd2b849bea84a9c0e07a557078c5a6e217 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -363,17 +363,15 @@ class TFConverter(object):
     op_def.name = op.name
     if op.type == 'DepthwiseConv2dNative':
       op_def.type = 'DepthwiseConv2d'
-      if self.device == 'neon':
-        self.transpose_filter_tensor[get_input_tensor(
-            op, 1).name] = (3, 2, 0, 1)
     else:
       op_def.type = op.type
-      if self.device == 'neon':
-        self.transpose_filter_tensor[get_input_tensor(
-            op, 1).name] = (3, 2, 0, 1)
-      else:
-        self.transpose_filter_tensor[get_input_tensor(
-            op, 1).name] = (0, 1, 3, 2)
+
+    if self.device == 'neon':
+      self.transpose_filter_tensor[get_input_tensor(
+          op, 1).name] = (3, 2, 0, 1)
+    elif op.type == 'Conv2D':
+      self.transpose_filter_tensor[get_input_tensor(
+          op, 1).name] = (0, 1, 3, 2)
     if self.device == 'gpu':
       op_def.input.extend([op.inputs[0].name])
       if op_def.type == 'DepthwiseConv2d':
@@ -402,21 +400,71 @@ class TFConverter(object):
     final_op = op
     self.resolved_ops[op.name] = 1
 
-    # convert global conv to fc
+    if len(self.tf_graph.get(op.name, [])) == 1 and \
+        self.tf_graph[op.name][0].type == 'BiasAdd':
+      bias_add_op = self.tf_graph[op.name][0]
+      if self.device == 'gpu':
+        output_name = self.add_buffer_to_image(
+            get_input_tensor(bias_add_op, 1).name, "ARGUMENT")
+        op_def.input.extend([output_name])
+      else:
+        op_def.input.extend([get_input_tensor(bias_add_op, 1).name])
+      final_op = bias_add_op
+      self.resolved_ops[bias_add_op.name] = 1
+
+    if len(self.tf_graph.get(final_op.name, [])) == 1 and \
+        self.tf_graph[final_op.name][0].type in activation_name_map:
+      activation_op = self.tf_graph[final_op.name][0]
+      if op_def.type == "Conv2D":
+        op_def.type = "FusedConv2D"
+      fused_act_arg = op_def.arg.add()
+      fused_act_arg.name = 'activation'
+      fused_act_arg.s = activation_name_map[activation_op.type]
+      if activation_op.type == 'Relu6':
+        max_limit_arg = op_def.arg.add()
+        max_limit_arg.name = 'max_limit'
+        max_limit_arg.f = 6
+      final_op = activation_op
+      self.resolved_ops[activation_op.name] = 1
+
+    op_def.output.extend([output.name for output in final_op.outputs])
+    self.add_output_shape(final_op.outputs, op_def)
+    self.net_def.op.extend([op_def])
+
+  def check_conv_to_fc(self, op):
+    if self.device != 'neon' or op.type != "Conv2D":
+      return False
     filter_shape = get_input_tensor(op, 1).shape.as_list()
     input_shape = get_input_tensor(op, 0).shape.as_list()
-    if op_def.type == "Conv2D" and input_shape[1] == filter_shape[0] and \
-        input_shape[2] == filter_shape[1] and \
-        (op.get_attr('padding') == 'VALID' or filter_shape[0] == 1 and
-         filter_shape[1] == 1):
-      print "convert op %s from CONV to FC" % op.name
-      op_def.type = 'FC'
-      self.reshape_tensor[get_input_tensor(op, 1).name] = \
-          [filter_shape[3],
-           filter_shape[2] * filter_shape[1] * filter_shape[0], 1, 1]
+    return input_shape[1] == filter_shape[0] \
+        and input_shape[2] == filter_shape[1] \
+        and (op.get_attr('padding') == 'VALID' or filter_shape[0] == 1
+             and filter_shape[1] == 1)
+
+  def convert_global_conv_to_fc(self, op):
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = 'FC'
+    self.transpose_filter_tensor[get_input_tensor(op, 1).name] = \
+        (3, 2, 0, 1)
+    filter_shape = get_input_tensor(op, 1).shape.as_list()
+    self.reshape_tensor[get_input_tensor(op, 1).name] = \
+        [filter_shape[3],
+         filter_shape[2] * filter_shape[1] * filter_shape[0], 1, 1]
+    op_def.input.extend(
+        [get_input_tensor(op, i).name for i in range(len(op.inputs))])
+
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NCHW'
+    final_op = op
+    self.resolved_ops[op.name] = 1
 
     if len(self.tf_graph.get(op.name, [])) == 1 and \
-       self.tf_graph[op.name][0].type == 'BiasAdd':
+        self.tf_graph[op.name][0].type == 'BiasAdd':
       bias_add_op = self.tf_graph[op.name][0]
       if self.device == 'gpu':
         output_name = self.add_buffer_to_image(
@@ -428,10 +476,8 @@ class TFConverter(object):
       self.resolved_ops[bias_add_op.name] = 1
 
     if len(self.tf_graph.get(final_op.name, [])) == 1 and \
-       self.tf_graph[final_op.name][0].type in activation_name_map:
+        self.tf_graph[final_op.name][0].type in activation_name_map:
       activation_op = self.tf_graph[final_op.name][0]
-      if op_def.type == "Conv2D":
-        op_def.type = "FusedConv2D"
       fused_act_arg = op_def.arg.add()
       fused_act_arg.name = 'activation'
       fused_act_arg.s = activation_name_map[activation_op.type]
@@ -985,6 +1031,8 @@ class TFConverter(object):
         self.convert_reshape(op)
       elif self.is_atrous_conv2d(op):
         self.convert_atrous_conv2d(op)
+      elif self.check_conv_to_fc(op):
+        self.convert_global_conv_to_fc(op)
       elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
         if self.check_winograd_conv(op):
           self.convert_winograd_conv(op)
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 7e0f2c7025d88cb9b0348f79ed2f3f1725946cd4..8d83b03d59f9aa63700c5dadd026049515eff071 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -188,8 +188,8 @@ DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
 DEFINE_int32(round, 1, "round");
 DEFINE_int32(restart_round, 1, "restart round");
 DEFINE_int32(malloc_check_cycle, -1,
              "malloc debug check cycle, -1 to disable");
-DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_perf_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_priority_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, 4, "num of openmp threads");
 DEFINE_int32(cpu_power_option, 0,
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 9d848f9e89c0174bc99474df0802ee3a9739b69a..4c3d16a2b09fdd62f60d00e81f5eab020fcd825b 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -30,9 +30,13 @@ def adb_split_stdout(stdout_str):
 
 
 def adb_devices(target_socs=None):
-  outputs = sh.grep(sh.adb("devices"), "^[A-Za-z0-9]\+[[:space:]]\+device$")
-  raw_lists = sh.cut(outputs, "-f1")
-  device_ids = adb_split_stdout(raw_lists)
+  device_ids = []
+  p = re.compile(r'(\w+)\s+device')
+  for line in adb_split_stdout(sh.adb("devices")):
+    m = p.match(line)
+    if m:
+      device_ids.append(m.group(1))
+
   if target_socs is not None:
     target_socs_set = set(target_socs)
     target_devices = []
@@ -49,7 +53,7 @@ def adb_getprop_by_serialno(serialno):
   outputs = sh.adb("-s", serialno, "shell", "getprop")
   raw_props = adb_split_stdout(outputs)
   props = {}
-  p = re.compile("\[(.+)\]: \[(.+)\]")
+  p = re.compile(r'\[(.+)\]: \[(.+)\]')
   for raw_prop in raw_props:
     m = p.match(raw_prop)
     if m:
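
Note: below is a minimal standalone sketch (not part of the patch) of how the
rewritten adb_devices() in tools/sh_commands.py picks serial numbers out of
`adb devices` output with the new r'(\w+)\s+device' pattern. The sample output
string is hypothetical; in the patch the lines come from
adb_split_stdout(sh.adb("devices")).

import re

# Hypothetical `adb devices` output; the real code reads it from the adb binary.
sample = ("List of devices attached\n"
          "0123456789ABCDEF\tdevice\n"
          "c4f556d2\tdevice\n")

p = re.compile(r'(\w+)\s+device')
device_ids = []
for line in sample.splitlines():
    # The header line never has a "<serial><whitespace>device" prefix, so
    # p.match() rejects it and only real device rows are collected.
    m = p.match(line)
    if m:
        device_ids.append(m.group(1))

print(device_ids)  # ['0123456789ABCDEF', 'c4f556d2']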
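Note: the new check_conv_to_fc()/convert_global_conv_to_fc() pair rewrites a
convolution whose kernel covers the entire input feature map (a "global" conv)
as a fully-connected layer. A toy sketch of the shape arithmetic, with assumed
NHWC input and HWIO filter shapes chosen purely for illustration:

# Assumed shapes, not taken from the patch.
input_shape = [1, 7, 7, 512]      # NHWC: batch, height, width, channels
filter_shape = [7, 7, 512, 1024]  # HWIO: kernel_h, kernel_w, in_ch, out_ch

# The kernel spans the full spatial extent, so a VALID conv yields a 1x1
# output and is equivalent to an FC layer over the flattened input.
is_global_conv = (input_shape[1] == filter_shape[0]
                  and input_shape[2] == filter_shape[1])

if is_global_conv:
    # The same reshape convert_global_conv_to_fc applies to the filter:
    # [out_ch, in_ch * kernel_w * kernel_h, 1, 1].
    fc_weight_shape = [filter_shape[3],
                       filter_shape[2] * filter_shape[1] * filter_shape[0],
                       1, 1]
    print(fc_weight_shape)  # [1024, 25088, 1, 1]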