提交 4f4882fa 编写于 作者: L luxuhui

Optimize the compile options and related code

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 77df54f2
......@@ -77,7 +77,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input,
thread_pool_->Compute1D([=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) {
uint8x16_t vi = vld1q_u8(input + i * 16);
float32x4x4_t vo = {
float32x4x4_t vo = {{
vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_low_u16(vmovl_u8(vget_low_u8(vi))))), vzero))),
......@@ -90,7 +90,7 @@ void QuantizeUtil<uint8_t>::Dequantize(const uint8_t *input,
vmulq_f32(vscale,
vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(
vget_high_u16(vmovl_u8(vget_high_u8(vi))))), vzero))),
};
}};
vst1q_f32(output + i * 16, vo.val[0]);
vst1q_f32(output + i * 16 + 4, vo.val[1]);
vst1q_f32(output + i * 16 + 8, vo.val[2]);
......
......@@ -93,6 +93,12 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
<< image_shape[0] << ", " << image_shape[1]
<< "] failed because of "
<< OpenCLErrorToString(error);
// Many users are confused by CL_INVALID_IMAGE_SIZE, so log the maximum supported image size as a hint.
if (error == CL_INVALID_IMAGE_SIZE) {
auto max_2d_size = opencl_runtime_->GetMaxImage2DSize();
LOG(WARNING) << "The allowable OpenCL image size is: "
<< max_2d_size[0] << "x" << max_2d_size[1];
}
delete cl_image;
*result = nullptr;
return MaceStatus::MACE_OUT_OF_RESOURCES;
......
......@@ -713,7 +713,7 @@ std::vector<uint64_t> OpenCLRuntime::GetMaxImage2DSize() {
LOG(ERROR) << "error: " << OpenCLErrorToString(err);
return {};
}
return {max_height, max_width};
return {max_width, max_height};
}
uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
......
......@@ -391,12 +391,12 @@ class MaceEngine::Impl {
std::unique_ptr<Workspace> ws_;
std::unique_ptr<NetBase> net_;
bool is_quantized_model_;
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
std::map<std::string, mace::InputOutputInfo> input_info_map_;
std::map<std::string, mace::InputOutputInfo> output_info_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_;
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
#endif
MACE_DISABLE_COPY_AND_ASSIGN(Impl);
};
......
......@@ -29,8 +29,6 @@
namespace mace {
namespace ops {
static constexpr int kCostPerGroup = 1024;
template <DeviceType D, class T>
class AddNOp;
......
......@@ -132,7 +132,7 @@ inline void TransposeNCHWToNHWCC2<float>(utils::ThreadPool *thread_pool,
for (w = 0; w + 3 < width; w += 4) {
float32x4_t vi0 = vld1q_f32(input + in_offset);
float32x4_t vi1 = vld1q_f32(input + in_offset + image_size);
float32x4x2_t vi = {vi0, vi1};
float32x4x2_t vi = {{vi0, vi1}};
vst2q_f32(output + out_offset, vi);
in_offset += 4;
out_offset += 8;
......
......@@ -133,7 +133,7 @@ MaceStatus SplitKernel<T>::Compute(
}
}
if (context->future() != nullptr) {
context->future()->wait_fn = [runtime, call_stats](CallStats *stats) {
context->future()->wait_fn = [call_stats](CallStats *stats) {
if (stats != nullptr) {
stats->start_micros = call_stats.start_micros;
stats->end_micros = stats->start_micros + call_stats.end_micros;
......
......@@ -101,11 +101,13 @@ void OpDefBuilder::Finalize(OperatorDef *op_def) const {
}
namespace {
#ifdef MACE_ENABLE_OPENCL
// Reads the MACE_INTERNAL_STORAGE_PATH environment variable.
// Returns its value, or an empty string when the variable is not set.
std::string GetStoragePathFromEnv() {
  const char *env_value = getenv("MACE_INTERNAL_STORAGE_PATH");
  if (env_value != nullptr) {
    return std::string(env_value);
  }
  return std::string();
}
#endif
} // namespace
OpTestContext *OpTestContext::Get(int num_threads,
......@@ -117,15 +119,21 @@ OpTestContext *OpTestContext::Get(int num_threads,
OpTestContext::OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy)
#ifdef MACE_ENABLE_OPENCL
: gpu_context_(std::make_shared<GPUContext>(GetStoragePathFromEnv())),
opencl_mem_types_({MemoryType::GPU_IMAGE}),
thread_pool_(make_unique<utils::ThreadPool>(num_threads,
cpu_affinity_policy)) {
#else
: thread_pool_(make_unique<utils::ThreadPool>(num_threads,
cpu_affinity_policy)) {
#endif
thread_pool_->Init();
device_map_[DeviceType::CPU] = make_unique<CPUDevice>(
num_threads, cpu_affinity_policy, thread_pool_.get());
#ifdef MACE_ENABLE_OPENCL
device_map_[DeviceType::GPU] = make_unique<GPUDevice>(
gpu_context_->opencl_tuner(),
gpu_context_->opencl_cache_storage(),
......@@ -135,16 +143,18 @@ OpTestContext::OpTestContext(int num_threads,
num_threads,
cpu_affinity_policy,
thread_pool_.get());
}
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
#endif
}
// Returns a non-owning pointer to the device stored for `device_type`.
// The lookup goes through std::map::operator[], so requesting a type that
// has no entry yet inserts an empty slot and the result is nullptr.
Device *OpTestContext::GetDevice(DeviceType device_type) {
  auto &slot = device_map_[device_type];
  return slot.get();
}
#ifdef MACE_ENABLE_OPENCL
// Returns a shared_ptr copy of the GPU context held in gpu_context_, so the
// caller shares ownership with this test context.
std::shared_ptr<GPUContext> OpTestContext::gpu_context() const {
return gpu_context_;
}
// Returns, by value, the list of OpenCL memory types currently stored in
// opencl_mem_types_.
std::vector<MemoryType> OpTestContext::opencl_mem_types() {
return opencl_mem_types_;
}
......@@ -160,6 +170,7 @@ void OpTestContext::SetOCLImageTestFlag() {
// Replaces opencl_mem_types_ with both OpenCL memory types, in the order
// GPU_IMAGE then GPU_BUFFER.
void OpTestContext::SetOCLImageAndBufferTestFlag() {
opencl_mem_types_ = {MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER};
}
#endif // MACE_ENABLE_OPENCL
bool OpsTestNet::Setup(mace::DeviceType device) {
NetDef net_def;
......@@ -231,6 +242,7 @@ MaceStatus OpsTestNet::Run() {
MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
if (device == DeviceType::GPU) {
#ifdef MACE_ENABLE_OPENCL
auto opencl_mem_types = OpTestContext::Get()->opencl_mem_types();
for (auto type : opencl_mem_types) {
OpTestContext::Get()->GetDevice(device)
......@@ -239,6 +251,9 @@ MaceStatus OpsTestNet::RunOp(mace::DeviceType device) {
MACE_RETURN_IF_ERROR(Run());
}
return MaceStatus::MACE_SUCCESS;
#else
return MaceStatus::MACE_UNSUPPORTED;
#endif // MACE_ENABLE_OPENCL
} else {
Setup(device);
return Run();
......
......@@ -29,8 +29,6 @@
#include "mace/core/types.h"
#include "mace/core/net.h"
#include "mace/core/device_context.h"
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_util.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/ops/ops_registry.h"
......@@ -40,6 +38,11 @@
#include "mace/core/quantize.h"
#include "mace/ops/testing/test_utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/opencl_util.h"
#endif
namespace mace {
namespace ops {
namespace test {
......@@ -78,12 +81,14 @@ class OpTestContext {
static OpTestContext *Get(
int num_threads = -1,
CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY);
std::shared_ptr<GPUContext> gpu_context() const;
Device *GetDevice(DeviceType device_type);
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context() const;
std::vector<MemoryType> opencl_mem_types();
void SetOCLBufferTestFlag();
void SetOCLImageTestFlag();
void SetOCLImageAndBufferTestFlag();
#endif
// Returns a non-owning pointer to the thread pool; ownership stays with the
// thread_pool_ unique_ptr member.
utils::ThreadPool *thread_pool() {
return thread_pool_.get();
}
......@@ -93,8 +98,10 @@ class OpTestContext {
CPUAffinityPolicy cpu_affinity_policy);
MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
#ifdef MACE_ENABLE_OPENCL
std::shared_ptr<GPUContext> gpu_context_;
std::vector<MemoryType> opencl_mem_types_;
#endif
std::map<DeviceType, std::unique_ptr<Device>> device_map_;
std::unique_ptr<utils::ThreadPool> thread_pool_;
};
......@@ -424,7 +431,9 @@ class OpsTestBase : public ::testing::Test {
}
// Test-fixture teardown hook. When built with MACE_ENABLE_OPENCL it calls
// OpTestContext::SetOCLImageTestFlag() on the shared test context; otherwise
// the body is empty.
// NOTE(review): presumably this restores the default image-only memory type
// between tests — confirm against SetOCLImageTestFlag's definition.
virtual void TearDown() {
#ifdef MACE_ENABLE_OPENCL
OpTestContext::Get()->SetOCLImageTestFlag();
#endif
}
};
......
......@@ -199,7 +199,7 @@ class StridedSliceOp : public Operation {
strides_indices_vec[d] > 0 ? 0 : -1,
strides_indices_vec[d] > 0 ? dim_len : dim_len - 1};
auto format_indices = [valid_range, d, dim_len](index_t indice) {
auto format_indices = [valid_range, dim_len](index_t indice) {
index_t forward = indice < 0 ? indice + dim_len : indice;
return Clamp(forward, valid_range[0], valid_range[1]);
};
......
......@@ -19,6 +19,9 @@ build:android --linkopt=-lm
build:android --distinct_host_configuration=true
build:android --crosstool_top=//external:android/crosstool
build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
build:android --copt -Wall
build:android --copt -Wno-mismatched-tags
build:android --copt -Wno-missing-braces
# Linux host build, --config linux
build:linux --define linux=true
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册