diff --git a/docs/faq.md b/docs/faq.md
index 90b61dadf5ceb5d84b13d53426527356f13e4de5..3b3ccf580b4d2425613cda1e2d90b9638aa189f7 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -61,6 +61,7 @@ Running models on Hexagon DSP need a few prerequisites for DSP developers:
 
 * You need to make sure SOCs of your phone is manufactured by Qualcomm and has HVX supported.
 * You need a phone that disables secure boot (once enabled, cannot be reversed, so you probably can only get that type phones from manufacturers)
+* You need to root your phone.
 * You need to sign your phone by using testsig provided by Qualcomm. (Download Qualcomm Hexagon SDK first, plugin your phone to PC, run scripts/testsig.py)
 * You need to push `third_party/nnlib/v6x/libhexagon_nn_skel.so` to `/system/vendor/lib/rfsa/adsp/`.
 
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index 445b7dc382298e455d5e3eb77b9575072cb612dd..da88f25ae92581712f094513ea1f6ba811841b4f 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -169,8 +169,8 @@ DEFINE_string(output_node, "output_node0,output_node1",
 DEFINE_string(input_shape, "", "input shape, separated by colon and comma");
 DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
 DEFINE_string(input_file, "", "input file name");
-DEFINE_int32(max_num_runs, 100, "number of runs max");
-DEFINE_string(max_time, "10.0", "length to run max");
+DEFINE_int32(max_num_runs, 100, "max number of runs");
+DEFINE_double(max_seconds, 10.0, "max number of seconds to run");
 DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
 DEFINE_string(opencl_binary_file,
               "",
@@ -209,14 +209,10 @@ int Main(int argc, char **argv) {
   LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]";
   LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]";
   LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]";
-  LOG(INFO) << "Max run time: [" << FLAGS_max_time << "]";
-
-  const double max_benchmark_time_seconds =
-      std::strtod(FLAGS_max_time.c_str(), nullptr);
+  LOG(INFO) << "Max run seconds: [" << FLAGS_max_seconds << "]";
 
   std::unique_ptr<OpStat> statistician(new OpStat());
 
-
   std::vector<std::string> input_names =
       str_util::Split(FLAGS_input_node, ',');
   std::vector<std::string> output_names =
@@ -365,7 +361,7 @@ int Main(int argc, char **argv) {
   int64_t no_stat_runs = 0;
   bool status =
       Run("Run without statistics", engine.get(), inputs, &outputs,
-          FLAGS_max_num_runs, max_benchmark_time_seconds,
+          FLAGS_max_num_runs, FLAGS_max_seconds,
           &no_stat_time_us, &no_stat_runs, nullptr);
   if (!status) {
     LOG(ERROR) << "Failed at normal no-stat run";
@@ -374,7 +370,7 @@ int Main(int argc, char **argv) {
   int64_t stat_time_us = 0;
   int64_t stat_runs = 0;
   status = Run("Run with statistics", engine.get(), inputs, &outputs,
-               FLAGS_max_num_runs, max_benchmark_time_seconds,
+               FLAGS_max_num_runs, FLAGS_max_seconds,
                &stat_time_us, &stat_runs, statistician.get());
   if (!status) {
     LOG(ERROR) << "Failed at normal stat run";
diff --git a/tools/converter.py b/tools/converter.py
index c994ed0d051ea0c668006dc15d58462062795638..b3a6569638137b52d25d3ac40b246eb8aba3bf8c 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -1282,6 +1282,16 @@ def parse_args():
         parents=[all_type_parent_parser, run_bm_parent_parser],
         help='benchmark model for detail information')
     benchmark.set_defaults(func=benchmark_model)
+    benchmark.add_argument(
+        "--max_num_runs",
+        type=int,
+        default=100,
+        help="max number of runs.")
+    benchmark.add_argument(
+        "--max_seconds",
+        type=float,
+        default=10.0,
+        help="max number of seconds to run.")
     return parser.parse_known_args()
 
 
diff --git a/tools/device.py b/tools/device.py
index d90e1907fe4c85c7642bc70bd69e118a5e1eaa4f..ccc174f38c6d592d232dc6def0d3ff0759e38095 100644
--- a/tools/device.py
+++ b/tools/device.py
@@ -743,6 +743,8 @@ class DeviceWrapper:
                         output_nodes,
                         input_shapes,
                         output_shapes,
+                        max_num_runs,
+                        max_seconds,
                         model_tag,
                         device_type,
                         model_graph_format,
@@ -756,7 +758,6 @@ class DeviceWrapper:
                         input_file_name='model_input',
                         link_dynamic=False):
         six.print_('* Benchmark for %s' % model_tag)
-
         mace_model_path = ''
         if model_graph_format == ModelFormat.file:
             mace_model_path = '%s/%s.pb' % (mace_model_dir, model_tag)
@@ -784,6 +785,8 @@ class DeviceWrapper:
                     '--output_shape=%s' % ':'.join(output_shapes),
                     '--input_file=%s/%s' % (model_output_dir, input_file_name),
                     "--model_data_file=%s" % model_data_file,
+                    '--max_num_runs=%d' % max_num_runs,
+                    '--max_seconds=%f' % max_seconds,
                     '--device=%s' % device_type,
                     '--omp_num_threads=%s' % omp_num_threads,
                     '--cpu_affinity_policy=%s' % cpu_affinity_policy,
@@ -837,6 +840,8 @@ class DeviceWrapper:
                 '--output_shape=%s' % ':'.join(output_shapes),
                 '--input_file=%s/%s' % (self.data_dir, input_file_name),
                 "--model_data_file=%s" % model_data_file,
+                '--max_num_runs=%d' % max_num_runs,
+                '--max_seconds=%f' % max_seconds,
                 '--device=%s' % device_type,
                 '--omp_num_threads=%s' % omp_num_threads,
                 '--cpu_affinity_policy=%s' % cpu_affinity_policy,
@@ -932,6 +937,12 @@ class DeviceWrapper:
                 runtime_list.append(model_runtime)
             for runtime in runtime_list:
                 device_type = parse_device_type(runtime)
+                if not subgraphs[0][YAMLKeyword.check_tensors]:
+                    output_nodes = subgraphs[0][YAMLKeyword.output_tensors]
+                    output_shapes = subgraphs[0][YAMLKeyword.output_shapes]
+                else:
+                    output_nodes = subgraphs[0][YAMLKeyword.check_tensors]
+                    output_shapes = subgraphs[0][YAMLKeyword.check_shapes]
                 self.benchmark_model(
                     abi=target_abi,
                     benchmark_binary_dir=build_tmp_binary_dir,
@@ -940,9 +951,11 @@ class DeviceWrapper:
                     embed_model_data=embed_model_data,
                     model_output_dir=model_output_dir,
                     input_nodes=subgraphs[0][YAMLKeyword.input_tensors],
-                    output_nodes=subgraphs[0][YAMLKeyword.output_tensors],
+                    output_nodes=output_nodes,
                     input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
-                    output_shapes=subgraphs[0][YAMLKeyword.output_shapes],
+                    output_shapes=output_shapes,
+                    max_num_runs=flags.max_num_runs,
+                    max_seconds=flags.max_seconds,
                     mace_model_dir=mace_model_dir,
                     model_tag=model_name,
                     device_type=device_type,
@@ -1041,6 +1054,8 @@ class DeviceManager:
         adb_list = [tuple(pair.split('\t')) for pair in adb_list]
         devices = []
         for adb in adb_list:
+            if adb[1].startswith("no permissions"):
+                continue
             prop = sh_commands.adb_getprop_by_serialno(adb[0])
             android = {
                 YAMLKeyword.device_name: