Merge branch 'darwin_copt' into 'master'

Optimize the compile options and related code. See merge request !1068

Merge branch 'darwin_copt' into 'master'
Optimize the compile options and related code. See merge request !1068
9b0b03c9 · 刘琦 · bde945cd · 4f4882fa · 9b0b03c9 · 9b0b03c9
11 changed files
--- a/mace/core/quantize.cc
+++ b/mace/core/quantize.cc
@@ -77,7 +77,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input,
  thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) {
    for (index_t i = start; i < end; i += step) {
      uint8x16_t vi = vld1q_u8(input + i * 16);
-      float32x4x4_t vo = {
+      float32x4x4_t vo = {{
          vmulq_f32(vscale,
                    vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
                        vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
@@ -90,7 +90,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input,
          vmulq_f32(vscale,
                    vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
                        vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
-      };
+      }};
      vst1q_f32(output + i * 16, vo.val[0]);
      vst1q_f32(output + i * 16 + 4, vo.val[1]);
      vst1q_f32(output + i * 16 + 8, vo.val[2]);

--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -93,6 +93,12 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                 << image_shape[0] << ", " << image_shape[1]
                 << "] failed because of "
                 << OpenCLErrorToString(error);
+    // Many users have doubts at CL_INVALID_IMAGE_SIZE, add some tips.
+    if (error == CL_INVALID_IMAGE_SIZE) {
+      auto max_2d_size = opencl_runtime_->GetMaxImage2DSize();
+      LOG(WARNING) << "The allowable OpenCL image size is: "
+                   << max_2d_size[0] << "x" << max_2d_size[1];
+    }
    delete cl_image;
    *result = nullptr;
    return MaceStatus::MACE_OUT_OF_RESOURCES;

--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -713,7 +713,7 @@ std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
    LOG(ERROR) << "error: " << OpenCLErrorToString(err);
    return {};
  }
-  return {max_height, max_width};
+  return {max_width, max_height};
 }
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {

--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -391,12 +391,12 @@ class MaceEngine::Impl {
  std::unique_ptr<Workspace> ws_;
  std::unique_ptr<NetBase> net_;
  bool is_quantized_model_;
-#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
-  std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
-#endif
  std::map<std::string, mace::InputOutputInfo> input_info_map_;
  std::map<std::string, mace::InputOutputInfo> output_info_map_;
  std::unique_ptr<utils::ThreadPool> thread_pool_;
+#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
+  std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
+#endif
  MACE_DISABLE_COPY_AND_ASSIGN(Impl);
 };

--- a/mace/ops/addn.cc
+++ b/mace/ops/addn.cc
@@ -29,8 +29,6 @@
 namespace mace {
 namespace ops {
-static constexpr int kCostPerGroup = 1024;
 template <DeviceType D, class T>
 class AddNOp;

--- a/mace/ops/common/transpose.h
+++ b/mace/ops/common/transpose.h
@@ -132,7 +132,7 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
      for (w = 0; w + 3 < width; w += 4) {
        float32x4_t vi0 = vld1q_f32(input + in_offset);
        float32x4_t vi1 = vld1q_f32(input + in_offset + image_size);
-        float32x4x2_t vi = {vi0, vi1};
+        float32x4x2_t vi = {{vi0, vi1}};
        vst2q_f32(output + out_offset, vi);
        in_offset += 4;
        out_offset += 8;

--- a/mace/ops/opencl/image/split.h
+++ b/mace/ops/opencl/image/split.h
@@ -133,7 +133,7 @@ MaceStatus SplitKernel<T>::Compute(
    }
  }
  if (context->future() != nullptr) {
-    context->future()->wait_fn = [runtime, call_stats](CallStats *stats) {
+    context->future()->wait_fn = [call_stats](CallStats *stats) {
      if (stats != nullptr) {
        stats->start_micros = call_stats.start_micros;
        stats->end_micros = stats->start_micros + call_stats.end_micros;

--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -101,11 +101,13 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const {
 }
 namespace {
+#ifdef MACE_ENABLE_OPENCL
 std::string GetStoragePathFromEnv() {
  char *storage_path_str = getenv("MACE_INTERNAL_STORAGE_PATH");
  if (storage_path_str == nullptr) return "";
  return storage_path_str;
 }
+#endif
 }  // namespace
 OpTestContext *OpTestContext::Get(int num_threads,
@@ -117,15 +119,21 @@ OpTestContext *OpTestContext::Get(int num_threads,
 OpTestContext::OpTestContext(int num_threads,
                             CPUAffinityPolicy cpu_affinity_policy)
+#ifdef MACE_ENABLE_OPENCL
    : gpu_context_(std::make_shared<GPUContext>(GetStoragePathFromEnv())),
      opencl_mem_types_({MemoryType::GPU_IMAGE}),
      thread_pool_(make_unique<utils::ThreadPool>(num_threads,
                                                  cpu_affinity_policy)) {
+#else
+    : thread_pool_(make_unique<utils::ThreadPool>(num_threads,
+                                                  cpu_affinity_policy)) {
+#endif
  thread_pool_->Init();
  device_map_[DeviceType::CPU] = make_unique<CPUDevice>(
      num_threads, cpu_affinity_policy, thread_pool_.get());
+#ifdef MACE_ENABLE_OPENCL
  device_map_[DeviceType::GPU] = make_unique<GPUDevice>(
      gpu_context_->opencl_tuner(),
      gpu_context_->opencl_cache_storage(),
@@ -135,16 +143,18 @@ OpTestContext::OpTestContext(int num_threads,
      num_threads,
      cpu_affinity_policy,
      thread_pool_.get());
-}
+#endif
-std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
-  return gpu_context_;
 }
 Device *OpTestContext::GetDevice(DeviceType device_type) {
  return device_map_[device_type].get();
 }
+#ifdef MACE_ENABLE_OPENCL
+std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
+  return gpu_context_;
+}
 std::vector<MemoryType> OpTestContext::opencl_mem_types() {
  return opencl_mem_types_;
 }
@@ -160,6 +170,7 @@ void OpTestContext::SetOCLImageTestFlag() {
 void OpTestContext::SetOCLImageAndBufferTestFlag() {
  opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER};
 }
+#endif  // MACE_ENABLE_OPENCL
 bool OpsTestNet::Setup(mace::DeviceType device) {
  NetDef net_def;
@@ -231,6 +242,7 @@ MaceStatus OpsTestNet::Run() {
 MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
  if (device == DeviceType::GPU) {
+#ifdef MACE_ENABLE_OPENCL
    auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
    for (auto type : opencl_mem_types) {
      OpTestContext::Get()->GetDevice(device)
@@ -239,6 +251,9 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
      MACE_RETURN_IF_ERROR(Run());
    }
    return MaceStatus::MACE_SUCCESS;
+#else
+    return MaceStatus::MACE_UNSUPPORTED;
+#endif  // MACE_ENABLE_OPENCL
  } else {
    Setup(device);
    return Run();

--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -29,8 +29,6 @@
 #include "mace/core/types.h"
 #include "mace/core/net.h"
 #include "mace/core/device_context.h"
-#include "mace/core/runtime/opencl/gpu_device.h"
-#include "mace/core/runtime/opencl/opencl_util.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
 #include "mace/ops/ops_registry.h"
@@ -40,6 +38,11 @@
 #include "mace/core/quantize.h"
 #include "mace/ops/testing/test_utils.h"
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/gpu_device.h"
+#include "mace/core/runtime/opencl/opencl_util.h"
+#endif
 namespace mace {
 namespace ops {
 namespace test {
@@ -78,12 +81,14 @@ class OpTestContext {
  static OpTestContext *Get(
      int num_threads = -1,
      CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY);
-  std::shared_ptr<GPUContext> gpu_context() const;
  Device *GetDevice(DeviceType device_type);
+#ifdef MACE_ENABLE_OPENCL
+  std::shared_ptr<GPUContext> gpu_context() const;
  std::vector<MemoryType> opencl_mem_types();
  void SetOCLBufferTestFlag();
  void SetOCLImageTestFlag();
  void SetOCLImageAndBufferTestFlag();
+#endif
  utils::ThreadPool *thread_pool() {
    return thread_pool_.get();
  }
@@ -93,8 +98,10 @@ class OpTestContext {
                CPUAffinityPolicy cpu_affinity_policy);
  MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
+#ifdef MACE_ENABLE_OPENCL
  std::shared_ptr<GPUContext> gpu_context_;
  std::vector<MemoryType> opencl_mem_types_;
+#endif
  std::map<DeviceType, std::unique_ptr<Device>> device_map_;
  std::unique_ptr<utils::ThreadPool> thread_pool_;
 };
@@ -424,7 +431,9 @@ class OpsTestBase : public ::testing::Test {
  }
  virtual void TearDown() {
+#ifdef MACE_ENABLE_OPENCL
    OpTestContext::Get()->SetOCLImageTestFlag();
+#endif
  }
 };

--- a/mace/ops/strided_slice.cc
+++ b/mace/ops/strided_slice.cc
@@ -199,7 +199,7 @@ class StridedSliceOp : public Operation {
            strides_indices_vec[d] > 0 ? 0 : -1,
            strides_indices_vec[d] > 0 ? dim_len : dim_len - 1};
-        auto format_indices = [valid_range, d, dim_len](index_t indice) {
+        auto format_indices = [valid_range, dim_len](index_t indice) {
          index_t forward = indice < 0 ? indice + dim_len : indice;
          return Clamp(forward, valid_range[0], valid_range[1]);
        };

--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -19,6 +19,9 @@ build:android --linkopt=-lm
 build:android --distinct_host_configuration=true
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:android --copt -Wall
+build:android --copt -Wno-mismatched-tags
+build:android --copt -Wno-missing-braces
 # Linux host build, --config linux
 build:linux --define linux=true