From 4f4882fa11d935ae1d44d13d83a586eec0c62952 Mon Sep 17 00:00:00 2001 From: luxuhui Date: Wed, 17 Apr 2019 19:02:57 +0800 Subject: [PATCH] opt the compile option and related code N/A Signed-off-by: Luxuhui --- mace/core/quantize.cc | 4 ++-- mace/core/runtime/opencl/opencl_allocator.cc | 6 +++++ mace/core/runtime/opencl/opencl_runtime.cc | 2 +- mace/libmace/mace.cc | 6 ++--- mace/ops/addn.cc | 2 -- mace/ops/common/transpose.h | 2 +- mace/ops/opencl/image/split.h | 2 +- mace/ops/ops_test_util.cc | 23 ++++++++++++++++---- mace/ops/ops_test_util.h | 15 ++++++++++--- mace/ops/strided_slice.cc | 2 +- tools/bazel.rc | 3 +++ 11 files changed, 49 insertions(+), 18 deletions(-) diff --git a/mace/core/quantize.cc b/mace/core/quantize.cc index 167c6da3..ec4c65ac 100644 --- a/mace/core/quantize.cc +++ b/mace/core/quantize.cc @@ -77,7 +77,7 @@ void QuantizeUtil::Dequantize(const uint8_t *input, thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { for (index_t i = start; i < end; i += step) { uint8x16_t vi = vld1q_u8(input + i * 16); - float32x4x4_t vo = { + float32x4x4_t vo = {{ vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), @@ -90,7 +90,7 @@ void QuantizeUtil::Dequantize(const uint8_t *input, vmulq_f32(vscale, vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), - }; + }}; vst1q_f32(output + i * 16, vo.val[0]); vst1q_f32(output + i * 16 + 4, vo.val[1]); vst1q_f32(output + i * 16 + 8, vo.val[2]); diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index e63b252b..0a42e295 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -93,6 +93,12 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector &image_shape, << image_shape[0] << ", " << image_shape[1] << "] failed because of " << OpenCLErrorToString(error); + // Many users have doubts at CL_INVALID_IMAGE_SIZE, add some tips. + if (error == CL_INVALID_IMAGE_SIZE) { + auto max_2d_size = opencl_runtime_->GetMaxImage2DSize(); + LOG(WARNING) << "The allowable OpenCL image size is: " + << max_2d_size[0] << "x" << max_2d_size[1]; + } delete cl_image; *result = nullptr; return MaceStatus::MACE_OUT_OF_RESOURCES; diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index bcf26498..e26c6048 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -713,7 +713,7 @@ std::vector OpenCLRuntime::GetMaxImage2DSize() { LOG(ERROR) << "error: " << OpenCLErrorToString(err); return {}; } - return {max_height, max_width}; + return {max_width, max_height}; } uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index c5e16b76..fe6ea488 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -391,12 +391,12 @@ class MaceEngine::Impl { std::unique_ptr ws_; std::unique_ptr net_; bool is_quantized_model_; -#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) - std::unique_ptr hexagon_controller_; -#endif std::map input_info_map_; std::map output_info_map_; std::unique_ptr thread_pool_; +#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) + std::unique_ptr hexagon_controller_; +#endif MACE_DISABLE_COPY_AND_ASSIGN(Impl); }; diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index ea6458d4..1f0fa7a1 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -29,8 +29,6 @@ namespace mace { namespace ops { -static constexpr int kCostPerGroup = 1024; - template class AddNOp; diff --git a/mace/ops/common/transpose.h b/mace/ops/common/transpose.h index 0c075185..b7b42490 100644 --- a/mace/ops/common/transpose.h +++ b/mace/ops/common/transpose.h @@ -132,7 +132,7 @@ inline void TransposeNCHWToNHWCC2(utils::ThreadPool *thread_pool, for (w = 0; w + 3 < width; w += 4) { float32x4_t vi0 = vld1q_f32(input + in_offset); float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); - float32x4x2_t vi = {vi0, vi1}; + float32x4x2_t vi = {{vi0, vi1}}; vst2q_f32(output + out_offset, vi); in_offset += 4; out_offset += 8; diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 04eaaa87..12755910 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -133,7 +133,7 @@ MaceStatus SplitKernel::Compute( } } if (context->future() != nullptr) { - context->future()->wait_fn = [runtime, call_stats](CallStats *stats) { + context->future()->wait_fn = [call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc index ab61e8c6..bcf1282d 100644 --- a/mace/ops/ops_test_util.cc +++ b/mace/ops/ops_test_util.cc @@ -101,11 +101,13 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const { } namespace { +#ifdef MACE_ENABLE_OPENCL std::string GetStoragePathFromEnv() { char *storage_path_str = getenv("MACE_INTERNAL_STORAGE_PATH"); if (storage_path_str == nullptr) return ""; return storage_path_str; } +#endif } // namespace OpTestContext *OpTestContext::Get(int num_threads, @@ -117,15 +119,21 @@ OpTestContext *OpTestContext::Get(int num_threads, OpTestContext::OpTestContext(int num_threads, CPUAffinityPolicy cpu_affinity_policy) +#ifdef MACE_ENABLE_OPENCL : gpu_context_(std::make_shared(GetStoragePathFromEnv())), opencl_mem_types_({MemoryType::GPU_IMAGE}), thread_pool_(make_unique(num_threads, cpu_affinity_policy)) { +#else + : thread_pool_(make_unique(num_threads, + cpu_affinity_policy)) { +#endif thread_pool_->Init(); device_map_[DeviceType::CPU] = make_unique( num_threads, cpu_affinity_policy, thread_pool_.get()); +#ifdef MACE_ENABLE_OPENCL device_map_[DeviceType::GPU] = make_unique( gpu_context_->opencl_tuner(), gpu_context_->opencl_cache_storage(), @@ -135,16 +143,18 @@ OpTestContext::OpTestContext(int num_threads, num_threads, cpu_affinity_policy, thread_pool_.get()); -} - -std::shared_ptr OpTestContext::gpu_context() const { - return gpu_context_; +#endif } Device *OpTestContext::GetDevice(DeviceType device_type) { return device_map_[device_type].get(); } +#ifdef MACE_ENABLE_OPENCL +std::shared_ptr OpTestContext::gpu_context() const { + return gpu_context_; +} + std::vector OpTestContext::opencl_mem_types() { return opencl_mem_types_; } @@ -160,6 +170,7 @@ void OpTestContext::SetOCLImageTestFlag() { void OpTestContext::SetOCLImageAndBufferTestFlag() { opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; } +#endif // MACE_ENABLE_OPENCL bool OpsTestNet::Setup(mace::DeviceType device) { NetDef net_def; @@ -231,6 +242,7 @@ MaceStatus OpsTestNet::Run() { MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { if (device == DeviceType::GPU) { +#ifdef MACE_ENABLE_OPENCL auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); for (auto type : opencl_mem_types) { OpTestContext::Get()->GetDevice(device) @@ -239,6 +251,9 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { MACE_RETURN_IF_ERROR(Run()); } return MaceStatus::MACE_SUCCESS; +#else + return MaceStatus::MACE_UNSUPPORTED; +#endif // MACE_ENABLE_OPENCL } else { Setup(device); return Run(); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index e9ef4d90..d2212a65 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -29,8 +29,6 @@ #include "mace/core/types.h" #include "mace/core/net.h" #include "mace/core/device_context.h" -#include "mace/core/runtime/opencl/gpu_device.h" -#include "mace/core/runtime/opencl/opencl_util.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/ops/ops_registry.h" @@ -40,6 +38,11 @@ #include "mace/core/quantize.h" #include "mace/ops/testing/test_utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/runtime/opencl/opencl_util.h" +#endif + namespace mace { namespace ops { namespace test { @@ -78,12 +81,14 @@ class OpTestContext { static OpTestContext *Get( int num_threads = -1, CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY); - std::shared_ptr gpu_context() const; Device *GetDevice(DeviceType device_type); +#ifdef MACE_ENABLE_OPENCL + std::shared_ptr gpu_context() const; std::vector opencl_mem_types(); void SetOCLBufferTestFlag(); void SetOCLImageTestFlag(); void SetOCLImageAndBufferTestFlag(); +#endif utils::ThreadPool *thread_pool() { return thread_pool_.get(); } @@ -93,8 +98,10 @@ class OpTestContext { CPUAffinityPolicy cpu_affinity_policy); MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); +#ifdef MACE_ENABLE_OPENCL std::shared_ptr gpu_context_; std::vector opencl_mem_types_; +#endif std::map> device_map_; std::unique_ptr thread_pool_; }; @@ -424,7 +431,9 @@ class OpsTestBase : public ::testing::Test { } virtual void TearDown() { +#ifdef MACE_ENABLE_OPENCL OpTestContext::Get()->SetOCLImageTestFlag(); +#endif } }; diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc index c10914f2..4218d1f7 100644 --- a/mace/ops/strided_slice.cc +++ b/mace/ops/strided_slice.cc @@ -199,7 +199,7 @@ class StridedSliceOp : public Operation { strides_indices_vec[d] > 0 ? 0 : -1, strides_indices_vec[d] > 0 ? dim_len : dim_len - 1}; - auto format_indices = [valid_range, d, dim_len](index_t indice) { + auto format_indices = [valid_range, dim_len](index_t indice) { index_t forward = indice < 0 ? indice + dim_len : indice; return Clamp(forward, valid_range[0], valid_range[1]); }; diff --git a/tools/bazel.rc b/tools/bazel.rc index 629129cf..f0c6d64f 100644 --- a/tools/bazel.rc +++ b/tools/bazel.rc @@ -19,6 +19,9 @@ build:android --linkopt=-lm build:android --distinct_host_configuration=true build:android --crosstool_top=//external:android/crosstool build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain +build:android --copt -Wall +build:android --copt -Wno-mismatched-tags +build:android --copt -Wno-missing-braces # Linux host build, --config linux build:linux --define linux=true -- GitLab