提交 9b0b03c9 编写于 作者: 刘琦

Merge branch 'darwin_copt' into 'master'

Optimize the compile options and related code

See merge request !1068
...@@ -77,7 +77,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input, ...@@ -77,7 +77,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input,
thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) { thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
uint8x16_t vi = vld1q_u8(input + i * 16); uint8x16_t vi = vld1q_u8(input + i * 16);
float32x4x4_t vo = { float32x4x4_t vo = {{
vmulq_f32(vscale, vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))), vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
...@@ -90,7 +90,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input, ...@@ -90,7 +90,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input,
vmulq_f32(vscale, vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16( vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))), vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
}; }};
vst1q_f32(output + i * 16, vo.val[0]); vst1q_f32(output + i * 16, vo.val[0]);
vst1q_f32(output + i * 16 + 4, vo.val[1]); vst1q_f32(output + i * 16 + 4, vo.val[1]);
vst1q_f32(output + i * 16 + 8, vo.val[2]); vst1q_f32(output + i * 16 + 8, vo.val[2]);
......
...@@ -93,6 +93,12 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -93,6 +93,12 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
<< image_shape[0] << ", " << image_shape[1] << image_shape[0] << ", " << image_shape[1]
<< "] failed because of " << "] failed because of "
<< OpenCLErrorToString(error); << OpenCLErrorToString(error);
// CL_INVALID_IMAGE_SIZE often confuses users, so log the maximum allowed 2D image size as a hint.
if (error == CL_INVALID_IMAGE_SIZE) {
auto max_2d_size = opencl_runtime_->GetMaxImage2DSize();
LOG(WARNING) << "The allowable OpenCL image size is: "
<< max_2d_size[0] << "x" << max_2d_size[1];
}
delete cl_image; delete cl_image;
*result = nullptr; *result = nullptr;
return MaceStatus::MACE_OUT_OF_RESOURCES; return MaceStatus::MACE_OUT_OF_RESOURCES;
......
...@@ -713,7 +713,7 @@ std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() { ...@@ -713,7 +713,7 @@ std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
LOG(ERROR) << "error: " << OpenCLErrorToString(err); LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return {}; return {};
} }
return {max_height, max_width}; return {max_width, max_height};
} }
uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
......
...@@ -391,12 +391,12 @@ class MaceEngine::Impl { ...@@ -391,12 +391,12 @@ class MaceEngine::Impl {
std::unique_ptr<Workspace> ws_; std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_; std::unique_ptr<NetBase> net_;
bool is_quantized_model_; bool is_quantized_model_;
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
std::map<std::string, mace::InputOutputInfo> input_info_map_; std::map<std::string, mace::InputOutputInfo> input_info_map_;
std::map<std::string, mace::InputOutputInfo> output_info_map_; std::map<std::string, mace::InputOutputInfo> output_info_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_; std::unique_ptr<utils::ThreadPool> thread_pool_;
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
MACE_DISABLE_COPY_AND_ASSIGN(Impl); MACE_DISABLE_COPY_AND_ASSIGN(Impl);
}; };
......
...@@ -29,8 +29,6 @@ ...@@ -29,8 +29,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
static constexpr int kCostPerGroup = 1024;
template <DeviceType D, class T> template <DeviceType D, class T>
class AddNOp; class AddNOp;
......
...@@ -132,7 +132,7 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool, ...@@ -132,7 +132,7 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
for (w = 0; w + 3 < width; w += 4) { for (w = 0; w + 3 < width; w += 4) {
float32x4_t vi0 = vld1q_f32(input + in_offset); float32x4_t vi0 = vld1q_f32(input + in_offset);
float32x4_t vi1 = vld1q_f32(input + in_offset + image_size); float32x4_t vi1 = vld1q_f32(input + in_offset + image_size);
float32x4x2_t vi = {vi0, vi1}; float32x4x2_t vi = {{vi0, vi1}};
vst2q_f32(output + out_offset, vi); vst2q_f32(output + out_offset, vi);
in_offset += 4; in_offset += 4;
out_offset += 8; out_offset += 8;
......
...@@ -133,7 +133,7 @@ MaceStatus SplitKernel<T>::Compute( ...@@ -133,7 +133,7 @@ MaceStatus SplitKernel<T>::Compute(
} }
} }
if (context->future() != nullptr) { if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, call_stats](CallStats *stats) { context->future()->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) { if (stats != nullptr) {
stats->start_micros = call_stats.start_micros; stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros; stats->end_micros = stats->start_micros + call_stats.end_micros;
......
...@@ -101,11 +101,13 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const { ...@@ -101,11 +101,13 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const {
} }
namespace { namespace {
#ifdef MACE_ENABLE_OPENCL
std::string GetStoragePathFromEnv() { std::string GetStoragePathFromEnv() {
char *storage_path_str = getenv("MACE_INTERNAL_STORAGE_PATH"); char *storage_path_str = getenv("MACE_INTERNAL_STORAGE_PATH");
if (storage_path_str == nullptr) return ""; if (storage_path_str == nullptr) return "";
return storage_path_str; return storage_path_str;
} }
#endif
} // namespace } // namespace
OpTestContext *OpTestContext::Get(int num_threads, OpTestContext *OpTestContext::Get(int num_threads,
...@@ -117,15 +119,21 @@ OpTestContext *OpTestContext::Get(int num_threads, ...@@ -117,15 +119,21 @@ OpTestContext *OpTestContext::Get(int num_threads,
OpTestContext::OpTestContext(int num_threads, OpTestContext::OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy) CPUAffinityPolicy cpu_affinity_policy)
#ifdef MACE_ENABLE_OPENCL
: gpu_context_(std::make_shared<GPUContext>(GetStoragePathFromEnv())), : gpu_context_(std::make_shared<GPUContext>(GetStoragePathFromEnv())),
opencl_mem_types_({MemoryType::GPU_IMAGE}), opencl_mem_types_({MemoryType::GPU_IMAGE}),
thread_pool_(make_unique<utils::ThreadPool>(num_threads, thread_pool_(make_unique<utils::ThreadPool>(num_threads,
cpu_affinity_policy)) { cpu_affinity_policy)) {
#else
: thread_pool_(make_unique<utils::ThreadPool>(num_threads,
cpu_affinity_policy)) {
#endif
thread_pool_->Init(); thread_pool_->Init();
device_map_[DeviceType::CPU] = make_unique<CPUDevice>( device_map_[DeviceType::CPU] = make_unique<CPUDevice>(
num_threads, cpu_affinity_policy, thread_pool_.get()); num_threads, cpu_affinity_policy, thread_pool_.get());
#ifdef MACE_ENABLE_OPENCL
device_map_[DeviceType::GPU] = make_unique<GPUDevice>( device_map_[DeviceType::GPU] = make_unique<GPUDevice>(
gpu_context_->opencl_tuner(), gpu_context_->opencl_tuner(),
gpu_context_->opencl_cache_storage(), gpu_context_->opencl_cache_storage(),
...@@ -135,16 +143,18 @@ OpTestContext::OpTestContext(int num_threads, ...@@ -135,16 +143,18 @@ OpTestContext::OpTestContext(int num_threads,
num_threads, num_threads,
cpu_affinity_policy, cpu_affinity_policy,
thread_pool_.get()); thread_pool_.get());
} #endif
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
} }
Device *OpTestContext::GetDevice(DeviceType device_type) { Device *OpTestContext::GetDevice(DeviceType device_type) {
return device_map_[device_type].get(); return device_map_[device_type].get();
} }
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
}
std::vector<MemoryType> OpTestContext::opencl_mem_types() { std::vector<MemoryType> OpTestContext::opencl_mem_types() {
return opencl_mem_types_; return opencl_mem_types_;
} }
...@@ -160,6 +170,7 @@ void OpTestContext::SetOCLImageTestFlag() { ...@@ -160,6 +170,7 @@ void OpTestContext::SetOCLImageTestFlag() {
void OpTestContext::SetOCLImageAndBufferTestFlag() { void OpTestContext::SetOCLImageAndBufferTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER}; opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER};
} }
#endif // MACE_ENABLE_OPENCL
bool OpsTestNet::Setup(mace::DeviceType device) { bool OpsTestNet::Setup(mace::DeviceType device) {
NetDef net_def; NetDef net_def;
...@@ -231,6 +242,7 @@ MaceStatus OpsTestNet::Run() { ...@@ -231,6 +242,7 @@ MaceStatus OpsTestNet::Run() {
MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
if (device == DeviceType::GPU) { if (device == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types(); auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
for (auto type : opencl_mem_types) { for (auto type : opencl_mem_types) {
OpTestContext::Get()->GetDevice(device) OpTestContext::Get()->GetDevice(device)
...@@ -239,6 +251,9 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) { ...@@ -239,6 +251,9 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
MACE_RETURN_IF_ERROR(Run()); MACE_RETURN_IF_ERROR(Run());
} }
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
#else
return MaceStatus::MACE_UNSUPPORTED;
#endif // MACE_ENABLE_OPENCL
} else { } else {
Setup(device); Setup(device);
return Run(); return Run();
......
...@@ -29,8 +29,6 @@ ...@@ -29,8 +29,6 @@
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/core/net.h" #include "mace/core/net.h"
#include "mace/core/device_context.h" #include "mace/core/device_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_util.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/ops/ops_registry.h" #include "mace/ops/ops_registry.h"
...@@ -40,6 +38,11 @@ ...@@ -40,6 +38,11 @@
#include "mace/core/quantize.h" #include "mace/core/quantize.h"
#include "mace/ops/testing/test_utils.h" #include "mace/ops/testing/test_utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_util.h"
#endif
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace test { namespace test {
...@@ -78,12 +81,14 @@ class OpTestContext { ...@@ -78,12 +81,14 @@ class OpTestContext {
static OpTestContext *Get( static OpTestContext *Get(
int num_threads = -1, int num_threads = -1,
CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY); CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY);
std::shared_ptr<GPUContext> gpu_context() const;
Device *GetDevice(DeviceType device_type); Device *GetDevice(DeviceType device_type);
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context() const;
std::vector<MemoryType> opencl_mem_types(); std::vector<MemoryType> opencl_mem_types();
void SetOCLBufferTestFlag(); void SetOCLBufferTestFlag();
void SetOCLImageTestFlag(); void SetOCLImageTestFlag();
void SetOCLImageAndBufferTestFlag(); void SetOCLImageAndBufferTestFlag();
#endif
utils::ThreadPool *thread_pool() { utils::ThreadPool *thread_pool() {
return thread_pool_.get(); return thread_pool_.get();
} }
...@@ -93,8 +98,10 @@ class OpTestContext { ...@@ -93,8 +98,10 @@ class OpTestContext {
CPUAffinityPolicy cpu_affinity_policy); CPUAffinityPolicy cpu_affinity_policy);
MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext); MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context_; std::shared_ptr<GPUContext> gpu_context_;
std::vector<MemoryType> opencl_mem_types_; std::vector<MemoryType> opencl_mem_types_;
#endif
std::map<DeviceType, std::unique_ptr<Device>> device_map_; std::map<DeviceType, std::unique_ptr<Device>> device_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_; std::unique_ptr<utils::ThreadPool> thread_pool_;
}; };
...@@ -424,7 +431,9 @@ class OpsTestBase : public ::testing::Test { ...@@ -424,7 +431,9 @@ class OpsTestBase : public ::testing::Test {
} }
virtual void TearDown() { virtual void TearDown() {
#ifdef MACE_ENABLE_OPENCL
OpTestContext::Get()->SetOCLImageTestFlag(); OpTestContext::Get()->SetOCLImageTestFlag();
#endif
} }
}; };
......
...@@ -199,7 +199,7 @@ class StridedSliceOp : public Operation { ...@@ -199,7 +199,7 @@ class StridedSliceOp : public Operation {
strides_indices_vec[d] > 0 ? 0 : -1, strides_indices_vec[d] > 0 ? 0 : -1,
strides_indices_vec[d] > 0 ? dim_len : dim_len - 1}; strides_indices_vec[d] > 0 ? dim_len : dim_len - 1};
auto format_indices = [valid_range, d, dim_len](index_t indice) { auto format_indices = [valid_range, dim_len](index_t indice) {
index_t forward = indice < 0 ? indice + dim_len : indice; index_t forward = indice < 0 ? indice + dim_len : indice;
return Clamp(forward, valid_range[0], valid_range[1]); return Clamp(forward, valid_range[0], valid_range[1]);
}; };
......
...@@ -19,6 +19,9 @@ build:android --linkopt=-lm ...@@ -19,6 +19,9 @@ build:android --linkopt=-lm
build:android --distinct_host_configuration=true build:android --distinct_host_configuration=true
build:android --crosstool_top=//external:android/crosstool build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:android --copt -Wall
build:android --copt -Wno-mismatched-tags
build:android --copt -Wno-missing-braces
# Linux host build, --config linux # Linux host build, --config linux
build:linux --define linux=true build:linux --define linux=true
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册