Mock runtime failure

1097312d · 李寅 · 2940ce5d · 1097312d · 1097312d · 1097312d
8 changed files
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -25,7 +25,9 @@
 #include "mace/core/macros.h"
 #include "mace/core/registry.h"
 #include "mace/core/types.h"
+#include "mace/core/runtime_failure_mock.h"
 #include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
 namespace mace {
@@ -65,6 +67,11 @@ class CPUAllocator : public Allocator {
    if (nbytes == 0) {
      return MaceStatus::MACE_SUCCESS;
    }
+    if (ShouldMockRuntimeFailure()) {
+      return MaceStatus::MACE_OUT_OF_RESOURCES;
+    }
    void *data = nullptr;
 #if defined(__ANDROID__) || defined(__hexagon__)
    data = memalign(kMaceAlignment, nbytes);

--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -121,14 +121,16 @@ MaceEngine::Impl::Impl(DeviceType device_type)
 #ifdef MACE_ENABLE_HEXAGON
      , hexagon_controller_(nullptr)
 #endif
-{}
+{
+  LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
+}
 MaceStatus MaceEngine::Impl::Init(
    const NetDef *net_def,
    const std::vector<std::string> &input_nodes,
    const std::vector<std::string> &output_nodes,
    const unsigned char *model_data) {
-  LOG(INFO) << "MACE version: " << MaceVersion();
+  LOG(INFO) << "Initializing MaceEngine";
  // Set storage path for internal usage
  for (auto input_name : input_nodes) {
    ws_->CreateTensor(MakeString("mace_input_node_", input_name),
@@ -158,7 +160,7 @@ MaceStatus MaceEngine::Impl::Init(
    MACE_RETURN_IF_ERROR(ws_->LoadModelTensor(
        *net_def, device_type_, model_data));
-  // Init model
+    // Init model
    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type_,
                         NetMode::INIT);
    MACE_RETURN_IF_ERROR(net->Run());
@@ -170,6 +172,7 @@ MaceStatus MaceEngine::Impl::Init(
 }
 MaceEngine::Impl::~Impl() {
+  LOG(INFO) << "Destroying MaceEngine";
 #ifdef MACE_ENABLE_HEXAGON
  if (device_type_ == HEXAGON) {
    if (VLOG_IS_ON(2)) {

--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -45,6 +45,11 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
    return MaceStatus::MACE_SUCCESS;
  }
  VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
+  if (ShouldMockRuntimeFailure()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
  cl_int error;
  cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
                                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
@@ -68,6 +73,10 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
          << image_shape[1];
+  if (ShouldMockRuntimeFailure()) {
+    return MaceStatus::MACE_OUT_OF_RESOURCES;
+  }
  cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
  cl_int error;

--- a/mace/core/runtime_failure_mock.cc
+++ b/mace/core/runtime_failure_mock.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <cstdlib>
+#include <string>
+#include "mace/core/runtime_failure_mock.h"
+#include "mace/utils/logging.h"
+namespace mace {
+namespace {
+// Reads the mock failure ratio from the MACE_RUNTIME_FAILURE_RATIO
+// environment variable.
+//
+// Returns 0 when the variable is unset. Otherwise the value is parsed as a
+// float with stream extraction. `ratio` starts at 0 so that an empty or
+// whitespace-only value, where extraction stops before storing anything,
+// also yields 0 instead of an indeterminate float.
+inline float GetRuntimeFailureRatioFromEnv() {
+  const char *env = getenv("MACE_RUNTIME_FAILURE_RATIO");
+  if (env == nullptr) {
+    return 0;
+  }
+  std::string env_str(env);
+  std::istringstream ss(env_str);
+  float ratio = 0;
+  ss >> ratio;
+  return ratio;
+}
+}  // namespace
+// Decides whether the caller should simulate a runtime failure.
+//
+// The failure ratio comes from GetRuntimeFailureRatioFromEnv() and the
+// rand_r() state is seeded from time(); both are function-local statics, so
+// they are set up on the first call only. Returns true (after logging) when a
+// random draw divided by RAND_MAX falls below the ratio, and always false
+// when the ratio is not above 1e-6.
+bool ShouldMockRuntimeFailure() {
+  static unsigned int rand_state = time(NULL);
+  static float failure_ratio = GetRuntimeFailureRatioFromEnv();
+  // Mocking is effectively disabled: skip the random draw entirely.
+  if (!(failure_ratio > 1e-6)) {
+    return false;
+  }
+  const float draw = rand_r(&rand_state) / static_cast<float>(RAND_MAX);
+  if (!(draw < failure_ratio)) {
+    return false;
+  }
+  VLOG(0) << "Mock runtime failure.";
+  return true;
+}
+}  // namespace mace
--- a/mace/core/runtime_failure_mock.h
+++ b/mace/core/runtime_failure_mock.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef MACE_CORE_RUNTIME_FAILURE_MOCK_H_
+#define MACE_CORE_RUNTIME_FAILURE_MOCK_H_
+namespace mace {
+// Returns true when the caller should simulate a runtime failure.
+//
+// The definition in runtime_failure_mock.cc draws a random number on each
+// call and compares it with the ratio read from the
+// MACE_RUNTIME_FAILURE_RATIO environment variable. When that variable is
+// unset the ratio is 0 and this always returns false.
+bool ShouldMockRuntimeFailure();
+}  // namespace mace
+#endif  // MACE_CORE_RUNTIME_FAILURE_MOCK_H_
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -120,45 +120,48 @@ struct mallinfo LogMallinfoChange(struct mallinfo prev) {
  struct mallinfo curr = mallinfo();
  if (prev.arena != curr.arena) {
    LOG(INFO) << "Non-mmapped space allocated (bytes): " << curr.arena
-              << ", diff: " << ((int64_t)curr.arena - (int64_t)prev.arena);
+              << ", diff: " << ((int64_t) curr.arena - (int64_t) prev.arena);
  }
  if (prev.ordblks != curr.ordblks) {
    LOG(INFO) << "Number of free chunks: " << curr.ordblks
-              << ", diff: " << ((int64_t)curr.ordblks - (int64_t)prev.ordblks);
+              << ", diff: "
+              << ((int64_t) curr.ordblks - (int64_t) prev.ordblks);
  }
  if (prev.smblks != curr.smblks) {
    LOG(INFO) << "Number of free fastbin blocks: " << curr.smblks
-              << ", diff: " << ((int64_t)curr.smblks - (int64_t)prev.smblks);
+              << ", diff: " << ((int64_t) curr.smblks - (int64_t) prev.smblks);
  }
  if (prev.hblks != curr.hblks) {
    LOG(INFO) << "Number of mmapped regions: " << curr.hblks
-              << ", diff: " << ((int64_t)curr.hblks - (int64_t)prev.hblks);
+              << ", diff: " << ((int64_t) curr.hblks - (int64_t) prev.hblks);
  }
  if (prev.hblkhd != curr.hblkhd) {
    LOG(INFO) << "Space allocated in mmapped regions (bytes): " << curr.hblkhd
-              << ", diff: " << ((int64_t)curr.hblkhd - (int64_t)prev.hblkhd);
+              << ", diff: " << ((int64_t) curr.hblkhd - (int64_t) prev.hblkhd);
  }
  if (prev.usmblks != curr.usmblks) {
    LOG(INFO) << "Maximum total allocated space (bytes): " << curr.usmblks
-              << ", diff: " << ((int64_t)curr.usmblks - (int64_t)prev.usmblks);
+              << ", diff: "
+              << ((int64_t) curr.usmblks - (int64_t) prev.usmblks);
  }
  if (prev.fsmblks != curr.fsmblks) {
    LOG(INFO) << "Space in freed fastbin blocks (bytes): " << curr.fsmblks
-              << ", diff: " << ((int64_t)curr.fsmblks - (int64_t)prev.fsmblks);
+              << ", diff: "
+              << ((int64_t) curr.fsmblks - (int64_t) prev.fsmblks);
  }
  if (prev.uordblks != curr.uordblks) {
    LOG(INFO) << "Total allocated space (bytes): " << curr.uordblks
              << ", diff: "
-              << ((int64_t)curr.uordblks - (int64_t)prev.uordblks);
+              << ((int64_t) curr.uordblks - (int64_t) prev.uordblks);
  }
  if (prev.fordblks != curr.fordblks) {
    LOG(INFO) << "Total free space (bytes): " << curr.fordblks << ", diff: "
-              << ((int64_t)curr.fordblks - (int64_t)prev.fordblks);
+              << ((int64_t) curr.fordblks - (int64_t) prev.fordblks);
  }
  if (prev.keepcost != curr.keepcost) {
    LOG(INFO) << "Top-most, releasable space (bytes): " << curr.keepcost
              << ", diff: "
-              << ((int64_t)curr.keepcost - (int64_t)prev.keepcost);
+              << ((int64_t) curr.keepcost - (int64_t) prev.keepcost);
  }
  return curr;
 }
@@ -227,39 +230,48 @@ bool RunModel(const std::string &model_name,
      new FileStorageFactory(kernel_file_path));
  SetKVStorageFactory(storage_factory);
-  std::shared_ptr<mace::MaceEngine> engine;
+  std::vector<unsigned char> model_pb_data;
-  MaceStatus create_engine_status;
-  // Create Engine
-  int64_t t0 = NowMicros();
  if (FLAGS_model_file != "") {
-    std::vector<unsigned char> model_pb_data;
    if (!mace::ReadBinaryFile(&model_pb_data, FLAGS_model_file)) {
      LOG(FATAL) << "Failed to read file: " << FLAGS_model_file;
    }
-    create_engine_status =
-        CreateMaceEngineFromProto(model_pb_data,
-                                  FLAGS_model_data_file,
-                                  input_names,
-                                  output_names,
-                                  device_type,
-                                  &engine);
-  } else {
-    create_engine_status =
-        CreateMaceEngineFromCode(model_name,
-                                 FLAGS_model_data_file,
-                                 input_names,
-                                 output_names,
-                                 device_type,
-                                 &engine);
  }
-  int64_t t1 = NowMicros();
-  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
+  std::shared_ptr<mace::MaceEngine> engine;
-    LOG(FATAL) << "Create engine error, please check the arguments";
+  MaceStatus create_engine_status;
-  }
-  double init_millis = (t1 - t0) / 1000.0;
+  double init_millis;
-  LOG(INFO) << "Total init latency: " << init_millis << " ms";
+  while (true) {
+    // Create Engine
+    int64_t t0 = NowMicros();
+    if (FLAGS_model_file != "") {
+      create_engine_status =
+          CreateMaceEngineFromProto(model_pb_data,
+                                    FLAGS_model_data_file,
+                                    input_names,
+                                    output_names,
+                                    device_type,
+                                    &engine);
+    } else {
+      create_engine_status =
+          CreateMaceEngineFromCode(model_name,
+                                   FLAGS_model_data_file,
+                                   input_names,
+                                   output_names,
+                                   device_type,
+                                   &engine);
+    }
+    int64_t t1 = NowMicros();
+    if (create_engine_status != MACE_SUCCESS) {
+      LOG(ERROR) << "Create engine runtime error, retry ... errcode: "
+                 << create_engine_status;
+    } else {
+      init_millis = (t1 - t0) / 1000.0;
+      LOG(INFO) << "Total init latency: " << init_millis << " ms";
+      break;
+    }
+  }
  const size_t input_count = input_names.size();
  const size_t output_count = output_names.size();
@@ -297,26 +309,84 @@ bool RunModel(const std::string &model_name,
  }
  LOG(INFO) << "Warm up run";
-  int64_t t3 = NowMicros();
+  double warmup_millis;
-  engine->Run(inputs, &outputs);
+  while (true) {
-  int64_t t4 = NowMicros();
+    int64_t t3 = NowMicros();
-  double warmup_millis = (t4 - t3) / 1000.0;
+    MaceStatus warmup_status = engine->Run(inputs, &outputs);
-  LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms";
+    if (warmup_status != MACE_SUCCESS) {
+      LOG(ERROR) << "Warmup runtime error, retry ... errcode: "
+                 << warmup_status;
+      do {
+        if (FLAGS_model_file != "") {
+          create_engine_status =
+              CreateMaceEngineFromProto(model_pb_data,
+                                        FLAGS_model_data_file,
+                                        input_names,
+                                        output_names,
+                                        device_type,
+                                        &engine);
+        } else {
+          create_engine_status =
+              CreateMaceEngineFromCode(model_name,
+                                       FLAGS_model_data_file,
+                                       input_names,
+                                       output_names,
+                                       device_type,
+                                       &engine);
+        }
+      } while (create_engine_status != MACE_SUCCESS);
+    } else {
+      int64_t t4 = NowMicros();
+      warmup_millis = (t4 - t3) / 1000.0;
+      LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms";
+      break;
+    }
+  }
  double model_run_millis = -1;
  if (FLAGS_round > 0) {
    LOG(INFO) << "Run model";
-    int64_t t0 = NowMicros();
+    int64_t total_run_duration = 0;
    struct mallinfo prev = mallinfo();
    for (int i = 0; i < FLAGS_round; ++i) {
-      engine->Run(inputs, &outputs);
+      MaceStatus run_status;
+      while (true) {
+        int64_t t0 = NowMicros();
+        run_status = engine->Run(inputs, &outputs);
+        if (run_status != MACE_SUCCESS) {
+          LOG(ERROR) << "Mace run model runtime error, retry ... errcode: "
+                     << run_status;
+          do {
+            if (FLAGS_model_file != "") {
+              create_engine_status =
+                  CreateMaceEngineFromProto(model_pb_data,
+                                            FLAGS_model_data_file,
+                                            input_names,
+                                            output_names,
+                                            device_type,
+                                            &engine);
+            } else {
+              create_engine_status =
+                  CreateMaceEngineFromCode(model_name,
+                                           FLAGS_model_data_file,
+                                           input_names,
+                                           output_names,
+                                           device_type,
+                                           &engine);
+            }
+          } while (create_engine_status != MACE_SUCCESS);
+        } else {
+          int64_t t1 = NowMicros();
+          total_run_duration += (t1 - t0);
+          break;
+        }
+      }
      if (FLAGS_malloc_check_cycle >= 1 && i % FLAGS_malloc_check_cycle == 0) {
        LOG(INFO) << "=== check malloc info change #" << i << " ===";
        prev = LogMallinfoChange(prev);
      }
    }
-    int64_t t1 = NowMicros();
+    model_run_millis = total_run_duration / 1000.0 / FLAGS_round;
-    model_run_millis = (t1 - t0) / 1000.0 / FLAGS_round;
    LOG(INFO) << "Average latency: " << model_run_millis << " ms";
  }

--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -171,7 +171,8 @@ def tuning_run(target_abi,
               omp_num_threads=-1,
               cpu_affinity_policy=1,
               gpu_perf_hint=3,
-               gpu_priority_hint=3):
+               gpu_priority_hint=3,
+               runtime_failure_ratio=0.0):
    stdout = sh_commands.tuning_run(
        target_abi,
        serialno,
@@ -195,6 +196,7 @@ def tuning_run(target_abi,
        cpu_affinity_policy,
        gpu_perf_hint,
        gpu_priority_hint,
+        runtime_failure_ratio,
        valgrind=FLAGS.valgrind,
        valgrind_path=FLAGS.valgrind_path,
        valgrind_args=FLAGS.valgrind_args
@@ -543,6 +545,11 @@ def parse_args():
        type=str,
        default="half",
        help="[half | float].")
+    parser.add_argument(
+        "--runtime_failure_ratio",
+        type=float,
+        default=0.0,
+        help="[mock runtime failure ratio].")
    return parser.parse_known_args()
@@ -632,6 +639,11 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
        if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
           FLAGS.mode == "all":
+            if FLAGS.mode == "run":
+                runtime_failure_ratio = FLAGS.runtime_failure_ratio
+            else:
+                runtime_failure_ratio = 0.0
            tuning_run(target_abi,
                       serialno,
                       vlog_level,
@@ -651,7 +663,8 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                       omp_num_threads=FLAGS.omp_num_threads,
                       cpu_affinity_policy=FLAGS.cpu_affinity_policy,
                       gpu_perf_hint=FLAGS.gpu_perf_hint,
-                       gpu_priority_hint=FLAGS.gpu_priority_hint)
+                       gpu_priority_hint=FLAGS.gpu_priority_hint,
+                       runtime_failure_ratio=runtime_failure_ratio)
        if FLAGS.mode == "benchmark":
            gen_opencl_and_tuning_code(

--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -597,6 +597,7 @@ def tuning_run(abi,
               cpu_affinity_policy=1,
               gpu_perf_hint=3,
               gpu_priority_hint=3,
+               runtime_failure_ratio=0.0,
               valgrind=False,
               valgrind_path="/data/local/tmp/valgrind",
               valgrind_args="",
@@ -617,6 +618,7 @@ def tuning_run(abi,
            [
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
+                "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
                "%s/mace_run" % model_output_dir,
                "--model_name=%s" % model_tag,
                "--input_node=%s" % ",".join(input_nodes),
@@ -678,6 +680,7 @@ def tuning_run(abi,
            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % phone_data_dir,
            "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir,
            "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time,
+            "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
        ]
        if valgrind:
            adb_cmd.extend([