Commit 28954099 authored by Bin Li, committed by luxuhui

Support HTA custom API

Parent f4061000
......@@ -28,21 +28,37 @@ cc_library(
"*.cc",
"runtime/cpu/*.cc",
],
exclude = [
"rpcmem.cc",
],
) + if_opencl_enabled(glob(
[
"runtime/opencl/*.cc",
],
)) + if_hexagon_enabled([
)) + if_hexagon_or_hta_enabled([
"runtime/hexagon/hexagon_allocator.h",
"runtime/hexagon/hexagon_allocator.cc",
"runtime/hexagon/hexagon_device.cc",
]) + if_hexagon_enabled([
"runtime/hexagon/hexagon_dsp_wrapper.cc",
]) + if_hta_enabled([
"runtime/hexagon/hexagon_hta_transformer.h",
"runtime/hexagon/hexagon_hta_transformer.cc",
"runtime/hexagon/hexagon_hta_wrapper.cc",
]) + if_apu_enabled(glob([
"runtime/apu/*.cc",
])),
hdrs = glob([
"*.h",
"runtime/cpu/*.h",
]) + if_opencl_enabled(glob([
])) + if_rpcmem_enabled([
"rpcmem.cc",
]),
hdrs = glob(
[
"*.h",
"runtime/cpu/*.h",
],
exclude = [
"rpcmem.h",
],
) + if_opencl_enabled(glob([
"runtime/opencl/*.h",
])) + if_hexagon_or_hta_enabled(glob([
"runtime/hexagon/hexagon_control_wrapper.h",
......@@ -53,7 +69,9 @@ cc_library(
"runtime/hexagon/*hta*.h",
])) + if_apu_enabled(glob([
"runtime/apu/*.h"
])),
])) + if_rpcmem_enabled([
"rpcmem.h",
]),
copts = [
"-Werror",
"-Wextra",
......
......@@ -17,11 +17,17 @@ set(CORE_SRCS
runtime/cpu/cpu_runtime.cc
)
if(MACE_ENABLE_RPCMEM)
set(CORE_SRCS ${CORE_SRCS} rpcmem.cc)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
endif(MACE_ENABLE_RPCMEM)
if(MACE_ENABLE_OPENCL)
set(CORE_SRCS ${CORE_SRCS}
runtime/opencl/gpu_device.cc
runtime/opencl/gpu_runtime.cc
runtime/opencl/opencl_allocator.cc
runtime/opencl/opencl_helper.cc
runtime/opencl/opencl_runtime.cc
runtime/opencl/opencl_util.cc
runtime/opencl/opencl_wrapper.cc
......@@ -30,12 +36,20 @@ if(MACE_ENABLE_OPENCL)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} generated_opencl_kernel dl)
endif(MACE_ENABLE_OPENCL)
if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
set(CORE_SRCS ${CORE_SRCS}
runtime/hexagon/hexagon_allocator.cc
runtime/hexagon/hexagon_device.cc
)
endif(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA)
if(MACE_ENABLE_HEXAGON_DSP)
set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_dsp_wrapper.cc)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hexagon_controller)
endif(MACE_ENABLE_HEXAGON_DSP)
if(MACE_ENABLE_HEXAGON_HTA)
set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_transformer.cc)
set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_wrapper.cc)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hta_hexagon_runtime)
endif(MACE_ENABLE_HEXAGON_HTA)
......@@ -45,10 +59,6 @@ if(MACE_ENABLE_MTK_APU)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} apu-frontend)
endif(MACE_ENABLE_MTK_APU)
if(MACE_ENABLE_RPCMEM)
set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
endif(MACE_ENABLE_RPCMEM)
add_library(core STATIC ${CORE_SRCS})
target_link_libraries(core PRIVATE
proto
......
......@@ -16,6 +16,54 @@
namespace mace {
MaceStatus Allocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) {
MACE_UNUSED(image_shape);
MACE_UNUSED(dt);
MACE_UNUSED(result);
MACE_NOT_IMPLEMENTED;
return MaceStatus::MACE_SUCCESS;
}
void Allocator::DeleteImage(void *data) {
MACE_UNUSED(data);
MACE_NOT_IMPLEMENTED;
}
void *Allocator::Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) {
MACE_UNUSED(nbytes);
MACE_UNUSED(finish_cmd_queue);
return reinterpret_cast<char*>(buffer) + offset;
}
void *Allocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) {
MACE_UNUSED(buffer);
MACE_UNUSED(image_shape);
MACE_UNUSED(mapped_image_pitch);
MACE_UNUSED(finish_cmd_queue);
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void Allocator::Unmap(void *buffer, void *mapper_ptr) {
MACE_UNUSED(buffer);
MACE_UNUSED(mapper_ptr);
}
#ifdef MACE_ENABLE_RPCMEM
Rpcmem *Allocator::rpcmem() {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
#endif // MACE_ENABLE_RPCMEM
Allocator *GetCPUAllocator() {
static CPUAllocator allocator;
return &allocator;
......
......@@ -27,6 +27,10 @@
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_RPCMEM
#include "mace/core/rpcmem.h"
#endif // MACE_ENABLE_RPCMEM
namespace mace {
#if defined(__hexagon__)
......@@ -50,19 +54,22 @@ class Allocator {
virtual MaceStatus New(size_t nbytes, void **result) = 0;
virtual MaceStatus NewImage(const std::vector<size_t> &image_shape,
const DataType dt,
void **result) = 0;
void **result);
virtual void Delete(void *data) = 0;
virtual void DeleteImage(void *data) = 0;
virtual void DeleteImage(void *data);
virtual void *Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const = 0;
bool finish_cmd_queue);
virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
bool finish_cmd_queue);
virtual void Unmap(void *buffer, void *mapper_ptr);
virtual bool OnHost() const = 0;
#ifdef MACE_ENABLE_RPCMEM
virtual Rpcmem *rpcmem();
#endif // MACE_ENABLE_RPCMEM
};
class CPUAllocator : public Allocator {
......@@ -84,46 +91,12 @@ class CPUAllocator : public Allocator {
return MaceStatus::MACE_SUCCESS;
}
MaceStatus NewImage(const std::vector<size_t> &shape,
const DataType dt,
void **result) override {
MACE_UNUSED(shape);
MACE_UNUSED(dt);
MACE_UNUSED(result);
LOG(FATAL) << "Allocate CPU image";
return MaceStatus::MACE_SUCCESS;
}
void Delete(void *data) override {
MACE_CHECK_NOTNULL(data);
VLOG(3) << "Free CPU buffer";
free(data);
}
void DeleteImage(void *data) override {
LOG(FATAL) << "Free CPU image";
free(data);
};
void *Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const override {
MACE_UNUSED(nbytes);
MACE_UNUSED(finish_cmd_queue);
return reinterpret_cast<char*>(buffer) + offset;
}
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const override {
MACE_UNUSED(image_shape);
MACE_UNUSED(mapped_image_pitch);
MACE_UNUSED(finish_cmd_queue);
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) const override {
MACE_UNUSED(buffer);
MACE_UNUSED(mapper_ptr);
}
bool OnHost() const override { return true; }
};
......
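The change above demotes NewImage, DeleteImage, Map, MapImage, and Unmap from pure virtuals to overridable base-class defaults (the MACE_NOT_IMPLEMENTED stubs added in allocator.cc), so a device allocator now only has to implement the calls it actually supports. A minimal sketch under that assumption; MyBufferAllocator is a hypothetical name, not part of this commit:

#include <cstdlib>
#include "mace/core/allocator.h"

namespace mace {
class MyBufferAllocator : public Allocator {
 public:
  MaceStatus New(size_t nbytes, void **result) override {
    *result = malloc(nbytes);
    return *result != nullptr ? MaceStatus::MACE_SUCCESS
                              : MaceStatus::MACE_OUT_OF_RESOURCES;
  }
  void Delete(void *data) override { free(data); }
  bool OnHost() const override { return true; }
  // NewImage/DeleteImage/MapImage inherit the MACE_NOT_IMPLEMENTED defaults;
  // Map/Unmap inherit plain pointer arithmetic from the base class.
};
}  // namespace mace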
......@@ -117,9 +117,12 @@ inline void GetOutputMultiplierAndShift(
template<typename F, typename Q>
class QuantizeUtil {
public:
QuantizeUtil() = default;
explicit QuantizeUtil(utils::ThreadPool *thread_pool)
: thread_pool_(thread_pool) {}
void Init(utils::ThreadPool *thread_pool) { thread_pool_ = thread_pool; }
void QuantizeWithScaleAndZeropoint(const float *input,
const index_t size,
float scale,
......
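QuantizeUtil gains a default constructor and an Init() method, enabling two-phase initialization: HexagonHTAWrapper no longer has to bind a thread pool in its member initializer list now that quantization has moved into the HTA transformer. A hedged usage sketch, where device stands in for any Device* with a CPU runtime:

mace::QuantizeUtil<float, uint8_t> quantize_util;           // thread pool unset
quantize_util.Init(&device->cpu_runtime()->thread_pool());  // bound later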
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/rpcmem.h"
#include "mace/utils/logging.h"
namespace mace {
Rpcmem::Rpcmem() {
rpcmem_init(&rm);
MACE_CHECK(rm.flag == 0, "rpcmem_init failed!");
}
Rpcmem::~Rpcmem() {
rpcmem_deinit(&rm);
}
void *Rpcmem::New(int heapid, uint32_t flags, int nbytes) {
return rpcmem_alloc(&rm, heapid, flags, nbytes);
}
void *Rpcmem::New(int nbytes) {
return New(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED, nbytes);
}
void Rpcmem::Delete(void *data) {
rpcmem_free(&rm, data);
}
int Rpcmem::ToFd(void *data) {
return rpcmem_to_fd(&rm, data);
}
int Rpcmem::SyncCacheStart(void *data) {
return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_START);
}
int Rpcmem::SyncCacheEnd(void *data) {
return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_END);
}
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RPCMEM_H_
#define MACE_CORE_RPCMEM_H_
#include "third_party/rpcmem/rpcmem.h"
namespace mace {
class Rpcmem {
public:
Rpcmem();
~Rpcmem();
void *New(int heapid, uint32_t flags, int nbytes);
void *New(int nbytes);
void Delete(void *data);
int ToFd(void *data);
int SyncCacheStart(void *data);
int SyncCacheEnd(void *data);
private:
rpcmem rm;
};
} // namespace mace
#endif // MACE_CORE_RPCMEM_H_
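The two new files wrap the third_party rpcmem C API in an RAII class: rpcmem_init runs in the constructor and rpcmem_deinit in the destructor, replacing the std::call_once/std::atexit pairing previously buried in OpenCLAllocator. A minimal lifecycle sketch, assuming a device where ION allocation succeeds:

#include "mace/core/rpcmem.h"

void RpcmemLifecycleSketch() {
  mace::Rpcmem rpcmem;            // rpcmem_init
  void *buf = rpcmem.New(4096);   // RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED
  int fd = rpcmem.ToFd(buf);      // ION fd an accelerator driver can import
  (void)fd;
  rpcmem.SyncCacheStart(buf);     // open a CPU cache window ...
  // ... CPU reads/writes buf here ...
  rpcmem.SyncCacheEnd(buf);       // ... then flush for the accelerator
  rpcmem.Delete(buf);             // rpcmem_free
}                                 // ~Rpcmem -> rpcmem_deinit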
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_allocator.h"
namespace mace {
MaceStatus HexagonAllocator::New(size_t nbytes, void **result) {
*result = rpcmem_.New(nbytes);
MACE_CHECK_NOTNULL(*result);
memset(*result, 0, nbytes);
return MaceStatus::MACE_SUCCESS;
}
void HexagonAllocator::Delete(void *data) {
rpcmem_.Delete(data);
}
bool HexagonAllocator::OnHost() const {
return true;
}
Rpcmem *HexagonAllocator::rpcmem() {
return &rpcmem_;
}
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
#include "mace/core/allocator.h"
namespace mace {
class HexagonAllocator : public Allocator {
public:
MaceStatus New(size_t nbytes, void **result) override;
void Delete(void *buffer) override;
bool OnHost() const override;
Rpcmem *rpcmem() override;
private:
Rpcmem rpcmem_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_
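HexagonAllocator plugs ION memory into the generic Allocator interface: New() returns zero-filled rpcmem buffers, and the rpcmem() accessor lets code that only holds a buffer as a void* recover its file descriptor, which is how HexagonHTAWrapper fills hexagon_hta_nn_hw_tensordef::fd below. A hedged sketch of that pattern:

#include "mace/core/runtime/hexagon/hexagon_allocator.h"

void HexagonAllocatorSketch() {
  mace::HexagonAllocator allocator;
  void *buffer = nullptr;
  if (allocator.New(1024, &buffer) == mace::MaceStatus::MACE_SUCCESS) {
    // The Rpcmem instance that allocated the buffer can also translate it.
    int fd = allocator.rpcmem()->ToFd(buffer);
    (void)fd;  // e.g. registered with the HTA via a tensordef
    allocator.Delete(buffer);
  }
}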
......@@ -25,27 +25,15 @@
#include "mace/public/mace.h"
namespace mace {
struct InOutInfo {
InOutInfo(const std::string &name,
const std::vector<index_t> &shape,
const DataType data_type,
const float scale,
const int32_t zero_point,
std::unique_ptr<Tensor> tensor_u8)
: name(name),
shape(shape),
data_type(data_type),
scale(scale),
zero_point(zero_point),
tensor_u8(std::move(tensor_u8)) {}
const DataType data_type)
: name(name), shape(shape), data_type(data_type) {}
std::string name;
std::vector<index_t> shape;
DataType data_type;
float scale;
int32_t zero_point;
std::unique_ptr<Tensor> tensor_u8;
};
class HexagonControlWrapper {
......@@ -79,8 +67,6 @@ class HexagonControlWrapper {
int nn_id_;
std::vector<InOutInfo> input_info_;
std::vector<InOutInfo> output_info_;
int num_inputs_;
int num_outputs_;
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_device.h"
namespace mace {
HexagonDevice::HexagonDevice(DeviceType device_type,
utils::ThreadPool *thread_pool
#ifdef MACE_ENABLE_OPENCL
, std::unique_ptr<GPUDevice> gpu_device
#endif // MACE_ENABLE_OPENCL
)
: CPUDevice(0, AFFINITY_NONE, thread_pool),
allocator_(make_unique<HexagonAllocator>()),
device_type_(device_type)
#ifdef MACE_ENABLE_OPENCL
, gpu_device_(std::move(gpu_device))
#endif // MACE_ENABLE_OPENCL
{}
#ifdef MACE_ENABLE_OPENCL
GPURuntime *HexagonDevice::gpu_runtime() {
return gpu_device_->gpu_runtime();
}
#endif // MACE_ENABLE_OPENCL
Allocator *HexagonDevice::allocator() {
#ifdef MACE_ENABLE_OPENCL
return gpu_device_->allocator();
#else
return allocator_.get();
#endif // MACE_ENABLE_OPENCL
}
DeviceType HexagonDevice::device_type() const {
return device_type_;
}
std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
Device *device) {
std::unique_ptr<HexagonControlWrapper> hexagon_controller;
auto device_type = device->device_type();
switch (device_type) {
#ifdef MACE_ENABLE_HEXAGON
case HEXAGON:
hexagon_controller = make_unique<HexagonDSPWrapper>();
break;
#endif
#ifdef MACE_ENABLE_HTA
case HTA:
hexagon_controller = make_unique<HexagonHTAWrapper>(device);
break;
#endif
default:
  LOG(FATAL) << "Unsupported Hexagon device type: " << device_type;
}
return hexagon_controller;
}
} // namespace mace
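CreateHexagonControlWrapper moves out of the header and now receives the device itself, so the HTA branch can hand the device's allocator (and optional GPU runtime) to HexagonHTAWrapper. A hedged construction sketch, assuming an HTA-enabled build; thread_pool and gpu_device stand in for objects the engine already owns:

auto device = mace::make_unique<mace::HexagonDevice>(
    mace::DeviceType::HTA, thread_pool
#ifdef MACE_ENABLE_OPENCL
    , std::move(gpu_device)
#endif  // MACE_ENABLE_OPENCL
    );
auto controller = mace::CreateHexagonControlWrapper(device.get());
controller->Init();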
......@@ -19,6 +19,7 @@
#include <utility>
#include "mace/core/device.h"
#include "mace/core/runtime/hexagon/hexagon_allocator.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#ifdef MACE_ENABLE_HEXAGON
#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h"
......@@ -26,44 +27,39 @@
#ifdef MACE_ENABLE_HTA
#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
#endif
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/gpu_device.h"
#include "mace/core/runtime/opencl/gpu_runtime.h"
#endif
namespace mace {
class HexagonDevice : public CPUDevice {
public:
explicit HexagonDevice(DeviceType device_type,
utils::ThreadPool *thread_pool)
: CPUDevice(0, AFFINITY_NONE, thread_pool),
device_type_(device_type) {}
HexagonDevice(DeviceType device_type,
#ifdef MACE_ENABLE_OPENCL
utils::ThreadPool *thread_pool,
std::unique_ptr<GPUDevice> gpu_device);
#else
utils::ThreadPool *thread_pool);
#endif // MACE_ENABLE_OPENCL
DeviceType device_type() const override {
return device_type_;
};
#ifdef MACE_ENABLE_OPENCL
GPURuntime *gpu_runtime() override;
#endif // MACE_ENABLE_OPENCL
Allocator *allocator() override;
DeviceType device_type() const override;
private:
std::unique_ptr<HexagonAllocator> allocator_;
DeviceType device_type_;
#ifdef MACE_ENABLE_OPENCL
std::unique_ptr<GPUDevice> gpu_device_;
#endif // MACE_ENABLE_OPENCL
};
std::unique_ptr<HexagonControlWrapper> CreateHexagonControlWrapper(
Device *device) {
std::unique_ptr<HexagonControlWrapper> hexagon_controller;
auto device_type = device->device_type();
switch (device_type) {
#ifdef MACE_ENABLE_HEXAGON
case HEXAGON:
hexagon_controller = make_unique<HexagonDSPWrapper>();
break;
#endif
#ifdef MACE_ENABLE_HTA
case HTA:
hexagon_controller = make_unique<HexagonHTAWrapper>(device);
break;
#endif
default:
  LOG(FATAL) << "Unsupported Hexagon device type: " << device_type;
}
return hexagon_controller;
}
Device *device);
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_
......@@ -107,8 +107,6 @@ hexagon_nn_corner_type TransformCornerType(HexagonNNCornerType corner) {
} // namespace
HexagonDSPWrapper::HexagonDSPWrapper() {
hexnn_controller_init();
std::string env_log_execute_time_str;
GetEnv("MACE_DSP_LOG_EXECUTE_TIME", &env_log_execute_time_str);
if (env_log_execute_time_str.empty()) {
......@@ -119,9 +117,7 @@ HexagonDSPWrapper::HexagonDSPWrapper() {
}
}
HexagonDSPWrapper::~HexagonDSPWrapper() {
hexnn_controller_deinit();
}
HexagonDSPWrapper::~HexagonDSPWrapper() {}
int HexagonDSPWrapper::GetVersion() {
int version;
......@@ -258,10 +254,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
}
input_info_.emplace_back(input_info.name(),
input_shape,
input_info.data_type(),
input_info.scale(),
input_info.zero_point(),
make_unique<Tensor>());
input_info.data_type());
}
// output info
......@@ -275,10 +268,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def,
}
output_info_.emplace_back(output_info.name(),
output_shape,
output_info.data_type(),
output_info.scale(),
output_info.zero_point(),
make_unique<Tensor>());
output_info.data_type());
VLOG(1) << "OutputInfo: "
<< "\n\t shape: " << output_shape[0] << " " << output_shape[1]
<< " " << output_shape[2] << " " << output_shape[3]
......
......@@ -56,7 +56,8 @@ class HexagonDSPWrapper : public HexagonControlWrapper {
uint64_t GetLastExecuteCycles();
bool log_execute_time_;
std::vector<InOutInfo> input_info_;
std::vector<InOutInfo> output_info_;
MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper);
};
} // namespace mace
......
This diff is collapsed.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
#include <memory>
#include <vector>
#include "mace/core/device.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/utils/math.h"
#include "mace/utils/thread_pool.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
class BaseTransformer {
public:
BaseTransformer() = default;
virtual ~BaseTransformer() = default;
virtual void Init(Device *device) { device_ = device; }
virtual MaceStatus Compute(const Tensor *input, Tensor *output) = 0;
protected:
Device *device_;
};
class HexagonHTATranformerBase {
public:
HexagonHTATranformerBase() = default;
virtual ~HexagonHTATranformerBase() = default;
virtual void Init(Device *device) = 0;
virtual MaceStatus SetInputTransformer(
const hexagon_hta_hw_layout format) = 0;
virtual MaceStatus SetOutputTransformer(
const hexagon_hta_hw_layout format) = 0;
virtual MaceStatus Quantize(const Tensor *input, Tensor *output) = 0;
virtual MaceStatus Dequantize(const Tensor *input, Tensor *output) = 0;
virtual MaceStatus TransformInput(const Tensor *input,
Tensor *output,
int index) = 0;
virtual MaceStatus TransformOutput(const Tensor *input,
Tensor *output,
int index) = 0;
};
template <DeviceType D>
class HexagonHTATranformer : public HexagonHTATranformerBase {
public:
void Init(Device *device) override;
MaceStatus SetInputTransformer(const hexagon_hta_hw_layout format) override;
MaceStatus SetOutputTransformer(const hexagon_hta_hw_layout format) override;
MaceStatus Quantize(const Tensor *input, Tensor *output) override;
MaceStatus Dequantize(const Tensor *input, Tensor *output) override;
MaceStatus TransformInput(const Tensor *input,
Tensor *output,
int index) override;
MaceStatus TransformOutput(const Tensor *input,
Tensor *output,
int index) override;
private:
Device *device_;
std::unique_ptr<BaseTransformer> quantizer_;
std::unique_ptr<BaseTransformer> dequantizer_;
std::vector<std::unique_ptr<BaseTransformer>> input_transformers_;
std::vector<std::unique_ptr<BaseTransformer>> output_transformers_;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_
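The transformer interface splits HTA host-side pre/post-processing into four pluggable stages, chosen per tensor from the hexagon_hta_hw_layout reported by hexagon_hta_nn_get_memory_layout: Quantize (float to uint8), TransformInput (uint8 NHWC to the hardware layout), TransformOutput (hardware layout back to uint8 NHWC), and Dequantize (uint8 back to float). The template parameter selects whether the layout transforms run as GPU kernels or on the CPU. A hedged sketch of the input half of the pipeline, mirroring ExecuteGraphNew below:

mace::MaceStatus FeedOneInput(mace::HexagonHTATranformerBase *transformer,
                              const mace::Tensor *input_f32,
                              mace::Tensor *quantized_u8,
                              mace::Tensor *hta_input,
                              int index) {
  // float NHWC -> uint8 NHWC; scale/zero_point are read from the tensor.
  MACE_RETURN_IF_ERROR(transformer->Quantize(input_f32, quantized_u8));
  // uint8 NHWC -> whatever layout the HTA reported for this input index.
  return transformer->TransformInput(quantized_u8, hta_input, index);
}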
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h"
#include <sys/types.h>
#include <algorithm>
#include <iomanip>
#include <map>
......@@ -24,48 +24,59 @@
#include <utility>
#include "mace/core/runtime/hexagon/hexagon_hta_ops.h"
#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
#include "mace/core/types.h"
#include "mace/utils/memory.h"
#include "mace/core/quantize.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
namespace {
struct InputOutputMetadata {
void Init(float min_val, float max_val, int needs_quantization) {
this->min_val = min_val;
this->max_val = max_val;
this->needs_quantization = needs_quantization;
int GetHTAEnv(const std::string &name, int default_value) {
int value = default_value;
std::string env_str;
MaceStatus status = GetEnv(name.c_str(), &env_str);
if (status == MaceStatus::MACE_SUCCESS && !env_str.empty()) {
value = std::atoi(env_str.c_str());
}
float min_val;
float max_val;
int needs_quantization;
};
template<typename T>
void AddInputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
tensor->batches = 1;
tensor->height = 1;
tensor->width = 1;
tensor->depth = 1;
tensor->data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(&data));
tensor->dataLen = sizeof(data);
tensor->data_valid_len = sizeof(data);
tensor->unused = 0;
return value;
}
template<typename T>
void AddOutputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) {
tensor->data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(&data));
tensor->dataLen = sizeof(data);
// Print the API logs to standard output.
void HtaApiLog(hexagon_hta_nn_nn_id id,
const char *api_op,
const char *const format,
...) {
va_list arg;
va_start(arg, format);
if (api_op != NULL) {
printf("Graph ID: %d\t", id);
}
vfprintf(stdout, format, arg);
va_end(arg);
}
// Print the performance stats to standard output.
void HtaPerformanceLog(int log_level,
uint32_t network_handle,
uint32_t thread_id,
const char *const format,
...) {
va_list arg;
va_start(arg, format);
printf("Log Level: %d, Network Handle: %d, Thread ID: %d - ", log_level,
network_handle, thread_id);
vfprintf(stdout, format, arg);
va_end(arg);
}
} // namespace
HexagonHTAWrapper::HexagonHTAWrapper(Device *device)
: quantize_util_(&device->cpu_runtime()->thread_pool()) {
: allocator_(device->allocator()),
#ifdef MACE_ENABLE_OPENCL
transformer_(make_unique<HexagonHTATranformer<GPU>>()) {
#else
transformer_(make_unique<HexagonHTATranformer<CPU>>()) {
#endif
transformer_->Init(device);
}
int HexagonHTAWrapper::GetVersion() {
......@@ -84,6 +95,40 @@ bool HexagonHTAWrapper::Init() {
LOG(INFO) << "Hexagon init";
MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed");
ResetPerfInfo();
int ret;
int power_level = GetHTAEnv("MACE_HTA_POWER_LEVEL", -1);
if (power_level != -1) {
ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_POWER_LEVEL,
&power_level, sizeof(power_level));
LOG(INFO) << "HTA_NN_CONFIG_POWER_LEVEL: " << power_level
<< " returns: " << ret;
}
int is_compress = GetHTAEnv("MACE_HTA_BANDWIDTH_COMPRESSION", 1);
if (is_compress) {
ret = hexagon_hta_nn_set_config_params(nn_id_,
HTA_NN_CONFIG_BANDWIDTH_COMPRESSION,
&is_compress, sizeof(is_compress));
LOG(INFO) << "HTA_NN_CONFIG_BANDWIDTH_COMPRESSION: " << is_compress
<< " returns: " << ret;
}
if (VLOG_IS_ON(2)) {
ret = hexagon_hta_nn_set_config_params(
nn_id_, HTA_NN_CONFIG_PERFORMANCE_LOG,
reinterpret_cast<void *>(&HtaPerformanceLog),
sizeof(&HtaPerformanceLog));
MACE_CHECK(ret == 0, "HTA_NN_CONFIG_PERFORMANCE_LOG returns: " , ret);
}
if (VLOG_IS_ON(3)) {
ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_API_LOG,
reinterpret_cast<void *>(&HtaApiLog),
sizeof(&HtaApiLog));
MACE_CHECK(ret == 0, "HTA_NN_CONFIG_API_LOG returns: ", ret);
}
return true;
}
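Init() now exposes three tuning knobs: MACE_HTA_POWER_LEVEL and MACE_HTA_BANDWIDTH_COMPRESSION are read from the environment through GetHTAEnv (compression defaults to on), while the API and performance log callbacks are only installed when the VLOG threshold reaches 3 and 2 respectively. A hedged sketch of driving them in-process before engine construction; the power level value is illustrative, and using MACE_CPP_MIN_VLOG_LEVEL as the VLOG switch is an assumption about MACE's logging setup:

#include <cstdlib>

void ConfigureHtaKnobs() {
  setenv("MACE_HTA_POWER_LEVEL", "4", 1);            // -> HTA_NN_CONFIG_POWER_LEVEL
  setenv("MACE_HTA_BANDWIDTH_COMPRESSION", "1", 1);  // already the default
  setenv("MACE_CPP_MIN_VLOG_LEVEL", "3", 1);         // >=2 perf log, >=3 API log
}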
......@@ -172,52 +217,107 @@ bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def,
outputs.size());
}
int64_t t1 = NowMicros();
MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");
int64_t t2 = NowMicros();
VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
// input info
num_inputs_ = net_def.input_info_size();
input_info_.reserve(num_inputs_);
for (const InputOutputInfo &input_info : net_def.input_info()) {
input_tensordef_.resize(num_inputs_);
for (int index = 0; index < num_inputs_; ++index) {
auto input_info = net_def.input_info(index);
std::vector<index_t> input_shape(input_info.dims().begin(),
input_info.dims().end());
while (input_shape.size() < 4) {
input_shape.insert(input_shape.begin(), 1);
}
input_info_.emplace_back(input_info.name(),
input_shape,
input_info.data_type(),
input_info.scale(),
input_info.zero_point(),
make_unique<Tensor>());
auto quantized_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
auto hta_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
hexagon_hta_nn_hw_tensordef &input_tensordef = input_tensordef_[index];
memset(&input_tensordef, 0, sizeof(input_tensordef));
MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 0, index,
&input_tensordef) == 0);
input_tensordef.dataLen = input_tensordef.batchStride;
VLOG(1) << input_tensordef.format << " " << input_tensordef.elementSize
<< " " << input_tensordef.numDims << " "
<< input_tensordef.batchStride;
for (uint32_t i = 0; i < input_tensordef.numDims; ++i) {
VLOG(1) << input_tensordef.dim[i].length << " "
<< input_tensordef.dim[i].lpadding << " "
<< input_tensordef.dim[i].valid;
}
hta_tensor->Resize({input_tensordef.dataLen});
MACE_CHECK(hta_tensor->raw_size() == input_tensordef.dataLen);
Tensor::MappingGuard input_guard(hta_tensor.get());
input_tensordef.fd =
allocator_->rpcmem()->ToFd(hta_tensor->mutable_data<void>());
MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &input_tensordef) == 0);
transformer_->SetInputTransformer(input_tensordef.format);
input_info_.emplace_back(
input_info.name(), input_shape, input_info.data_type(),
input_info.scale(), input_info.zero_point(),
std::move(quantized_tensor),
std::move(hta_tensor));
}
// output info
num_outputs_ = net_def.output_info_size();
output_info_.reserve(num_outputs_);
for (const InputOutputInfo &output_info : net_def.output_info()) {
output_tensordef_.resize(num_outputs_);
for (int index = 0; index < num_outputs_; ++index) {
auto output_info = net_def.output_info(index);
std::vector<index_t> output_shape(output_info.dims().begin(),
output_info.dims().end());
while (output_shape.size() < 4) {
output_shape.insert(output_shape.begin(), 1);
}
output_info_.emplace_back(output_info.name(),
output_shape,
output_info.data_type(),
output_info.scale(),
output_info.zero_point(),
make_unique<Tensor>());
auto quantized_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
auto hta_tensor = make_unique<Tensor>(allocator_, DT_UINT8);
quantized_tensor->SetScale(output_info.scale());
quantized_tensor->SetZeroPoint(output_info.zero_point());
hexagon_hta_nn_hw_tensordef &output_tensordef = output_tensordef_[index];
memset(&output_tensordef, 0, sizeof(output_tensordef));
MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 1, index,
&output_tensordef) == 0);
output_tensordef.dataLen = output_tensordef.batchStride;
VLOG(1) << output_tensordef.format << " " << output_tensordef.elementSize
<< " " << output_tensordef.numDims << " "
<< output_tensordef.batchStride;
for (uint32_t i = 0; i < output_tensordef.numDims; ++i) {
VLOG(1) << output_tensordef.dim[i].length << " "
<< output_tensordef.dim[i].lpadding << " "
<< output_tensordef.dim[i].valid;
}
hta_tensor->Resize({output_tensordef.batchStride});
MACE_CHECK(hta_tensor->raw_size() == output_tensordef.dataLen);
Tensor::MappingGuard output_guard(hta_tensor.get());
output_tensordef.fd =
allocator_->rpcmem()->ToFd(hta_tensor->mutable_data<void>());
MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &output_tensordef) == 0);
transformer_->SetOutputTransformer(output_tensordef.format);
output_info_.emplace_back(
output_info.name(), output_shape, output_info.data_type(),
output_info.scale(), output_info.zero_point(),
std::move(quantized_tensor), std::move(hta_tensor));
VLOG(1) << "OutputInfo: "
<< "\n\t shape: " << output_shape[0] << " " << output_shape[1]
<< " " << output_shape[2] << " " << output_shape[3]
<< "\n\t type: " << output_info.data_type();
}
int64_t t1 = NowMicros();
MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed");
int64_t t2 = NowMicros();
VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
return true;
}
......@@ -266,78 +366,41 @@ bool HexagonHTAWrapper::ExecuteGraphNew(
MACE_CHECK(num_outputs_ == static_cast<int>(num_outputs),
"Wrong outputs num");
std::vector<hexagon_hta_nn_tensordef> inputs(num_inputs * kNumMetaData);
std::vector<hexagon_hta_nn_tensordef> outputs(num_outputs * kNumMetaData);
std::vector<InputOutputMetadata> input_metadata(num_inputs);
std::vector<InputOutputMetadata> output_metadata(num_outputs);
for (size_t i = 0; i < num_inputs; ++i) {
const auto input_tensor = input_tensors.at(input_info_[i].name);
const auto &input_shape = input_tensor->shape();
size_t index = i * kNumMetaData;
inputs[index].batches = static_cast<uint32_t>(input_shape[0]);
inputs[index].height = static_cast<uint32_t>(input_shape[1]);
inputs[index].width = static_cast<uint32_t>(input_shape[2]);
inputs[index].depth = static_cast<uint32_t>(input_shape[3]);
inputs[index].data = const_cast<unsigned char *>(
reinterpret_cast<const unsigned char *>(input_tensor->raw_data()));
inputs[index].dataLen = static_cast<int>(input_tensor->raw_size());
inputs[index].data_valid_len =
static_cast<uint32_t>(input_tensor->raw_size());
inputs[index].unused = 0;
input_metadata[i].Init(.0f, .0f, 1);
AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]);
AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]);
AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]);
}
input_tensor->SetScale(input_info_[i].scale);
input_tensor->SetZeroPoint(input_info_[i].zero_point);
MACE_CHECK_SUCCESS(
transformer_->Quantize(input_tensors.at(input_info_[i].name),
input_info_[i].quantized_tensor.get()));
// transform mace output to hexagon output
for (size_t i = 0; i < num_outputs; ++i) {
auto output_tensor = output_tensors->at(output_info_[i].name);
size_t index = i * kNumMetaData;
output_tensor->SetDtype(output_info_[i].data_type);
output_tensor->Resize(output_info_[i].shape);
outputs[index].data = reinterpret_cast<unsigned char *>(
output_tensor->raw_mutable_data());
outputs[index].dataLen = static_cast<int>(output_tensor->raw_size());
output_metadata[i].Init(.0f, .0f, 1);
AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]);
AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]);
AddOutputMetadata(output_metadata[i].needs_quantization,
&outputs[index + 3]);
MACE_CHECK_SUCCESS(transformer_->TransformInput(
input_info_[i].quantized_tensor.get(),
input_info_[i].hta_tensor.get(), i));
Tensor::MappingGuard input_guard(input_info_[i].hta_tensor.get());
}
int res = hexagon_hta_nn_execute_new(nn_id_,
inputs.data(),
num_inputs * kNumMetaData,
outputs.data(),
num_outputs * kNumMetaData);
MACE_CHECK(res == 0, "execute error");
MACE_CHECK(hexagon_hta_nn_execute_hw(nn_id_,
input_tensordef_.data(), num_inputs,
output_tensordef_.data(), num_outputs,
nullptr, nullptr) == 0);
for (size_t i = 0; i < num_outputs; ++i) {
size_t index = i * kNumMetaData;
std::vector<uint32_t> output_shape{
outputs[index].batches, outputs[index].height, outputs[index].width,
outputs[index].depth};
MACE_CHECK(output_shape.size() == output_info_[i].shape.size(),
output_shape.size(), " vs ", output_info_[i].shape.size(),
" wrong output shape inferred");
for (size_t j = 0; j < output_shape.size(); ++j) {
MACE_CHECK(static_cast<index_t>(output_shape[j])
== output_info_[i].shape[j],
output_shape[j], " vs ", output_info_[i].shape[j],
" wrong output shape[", j, "] inferred");
{  // Map and unmap once, only to sync the rpcmem cache
Tensor::MappingGuard output_guard(output_info_[i].hta_tensor.get());
}
output_info_[i].quantized_tensor->Resize(output_info_[i].shape);
transformer_->TransformOutput(output_info_[i].hta_tensor.get(),
output_info_[i].quantized_tensor.get(), i);
auto output_tensor = output_tensors->at(output_info_[i].name);
MACE_CHECK(static_cast<index_t>(outputs[index].data_valid_len)
== output_tensor->raw_size(),
outputs[index].data_valid_len, " vs ", output_tensor->raw_size(),
" wrong output bytes inferred.");
MaceStatus st = transformer_->Dequantize(
output_info_[i].quantized_tensor.get(), output_tensor);
}
return res == 0;
return true;
}
} // namespace mace
......@@ -16,18 +16,40 @@
#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include "mace/utils/thread_pool.h"
#include "mace/core/quantize.h"
#include "mace/core/device.h"
#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
#include "mace/core/tensor.h"
#include "mace/core/device.h"
#include "mace/public/mace.h"
#include "third_party/hta/hta_hexagon_api.h"
namespace mace {
struct HTAInOutInfo : public InOutInfo {
HTAInOutInfo(const std::string &name,
const std::vector<index_t> &shape,
const DataType data_type,
const float scale,
const int32_t zero_point,
std::unique_ptr<Tensor> quantized_tensor,
std::unique_ptr<Tensor> hta_tensor)
: InOutInfo(name, shape, data_type),
scale(scale),
zero_point(zero_point),
quantized_tensor(std::move(quantized_tensor)),
hta_tensor(std::move(hta_tensor)) {}
float scale;
int32_t zero_point;
std::unique_ptr<Tensor> quantized_tensor;
std::unique_ptr<Tensor> hta_tensor;
};
class HexagonHTAWrapper : public HexagonControlWrapper {
public:
explicit HexagonHTAWrapper(Device *device);
......@@ -50,7 +72,12 @@ class HexagonHTAWrapper : public HexagonControlWrapper {
void SetDebugLevel(int level) override;
private:
QuantizeUtil<float, uint8_t> quantize_util_;
Allocator *allocator_;
std::vector<HTAInOutInfo> input_info_;
std::vector<HTAInOutInfo> output_info_;
std::vector<hexagon_hta_nn_hw_tensordef> input_tensordef_;
std::vector<hexagon_hta_nn_hw_tensordef> output_tensordef_;
std::unique_ptr<HexagonHTATranformerBase> transformer_;
MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper);
};
} // namespace mace
......
......@@ -16,9 +16,6 @@
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#ifdef MACE_ENABLE_RPCMEM
#include "third_party/rpcmem/rpcmem.h"
#endif // MACE_ENABLE_RPCMEM
namespace mace {
namespace {
......@@ -39,24 +36,10 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
}
}
#ifdef MACE_ENABLE_RPCMEM
std::once_flag ion_prepared;
void PrepareQualcommION() {
rpcmem_init();
std::atexit(rpcmem_deinit);
}
#endif // MACE_ENABLE_RPCMEM
} // namespace
OpenCLAllocator::OpenCLAllocator(
OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {
#ifdef MACE_ENABLE_RPCMEM
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
std::call_once(ion_prepared, PrepareQualcommION);
}
#endif // MACE_ENABLE_RPCMEM
}
OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {}
OpenCLAllocator::~OpenCLAllocator() {}
......@@ -168,7 +151,7 @@ void OpenCLAllocator::Delete(void *buffer) {
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
auto it = cl_to_host_map_.find(buffer);
MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL buffer not found!");
rpcmem_free(it->second);
rpcmem_.Delete(it->second);
cl_to_host_map_.erase(buffer);
}
#endif // MACE_ENABLE_RPCMEM
......@@ -184,7 +167,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
auto it = cl_to_host_map_.find(buffer);
MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL image not found!");
rpcmem_free(it->second);
rpcmem_.Delete(it->second);
cl_to_host_map_.erase(buffer);
}
#endif // MACE_ENABLE_RPCMEM
......@@ -194,7 +177,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
void *OpenCLAllocator::Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const {
bool finish_cmd_queue) {
MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
void *mapped_ptr = nullptr;
#ifdef MACE_ENABLE_RPCMEM
......@@ -209,7 +192,7 @@ void *OpenCLAllocator::Map(void *buffer,
if (opencl_runtime_->qcom_host_cache_policy() ==
CL_MEM_HOST_WRITEBACK_QCOM) {
MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0);
}
} else {
#endif // MACE_ENABLE_RPCMEM
......@@ -234,7 +217,7 @@ void *OpenCLAllocator::Map(void *buffer,
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const {
bool finish_cmd_queue) {
MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
void *mapped_ptr = nullptr;
......@@ -251,7 +234,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
if (opencl_runtime_->qcom_host_cache_policy() ==
CL_MEM_HOST_WRITEBACK_QCOM) {
MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0);
}
} else {
#endif // MACE_ENABLE_RPCMEM
......@@ -275,13 +258,13 @@ void *OpenCLAllocator::MapImage(void *buffer,
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
#ifdef MACE_ENABLE_RPCMEM
if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
if (opencl_runtime_->qcom_host_cache_policy() ==
CL_MEM_HOST_WRITEBACK_QCOM) {
MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_END) == 0);
MACE_CHECK(rpcmem_.SyncCacheEnd(mapped_ptr) == 0);
}
} else {
#endif // MACE_ENABLE_RPCMEM
......@@ -301,17 +284,20 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
bool OpenCLAllocator::OnHost() const { return false; }
#ifdef MACE_ENABLE_RPCMEM
Rpcmem *OpenCLAllocator::rpcmem() {
return &rpcmem_;
}
void OpenCLAllocator::CreateQualcommBufferIONHostPtr(
const size_t nbytes,
cl_mem_ion_host_ptr *ion_host) {
void *host = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED,
nbytes + opencl_runtime_->qcom_ext_mem_padding());
void *host = rpcmem_.New(nbytes + opencl_runtime_->qcom_ext_mem_padding());
MACE_CHECK_NOTNULL(host);
auto host_addr = reinterpret_cast<std::uintptr_t>(host);
auto page_size = opencl_runtime_->qcom_page_size();
MACE_CHECK(host_addr % page_size == 0, "ION memory address: ", host_addr,
" must be aligned to page size: ", page_size);
int fd = rpcmem_to_fd(host);
int fd = rpcmem_.ToFd(host);
MACE_CHECK(fd >= 0, "Invalid rpcmem file descriptor: ", fd);
ion_host->ext_host_ptr.allocation_type = CL_MEM_ION_HOST_PTR_QCOM;
......
......@@ -48,17 +48,21 @@ class OpenCLAllocator : public Allocator {
void *Map(void *buffer,
size_t offset,
size_t nbytes,
bool finish_cmd_queue) const override;
bool finish_cmd_queue) override;
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch,
bool finish_cmd_queue) const override;
bool finish_cmd_queue) override;
void Unmap(void *buffer, void *mapped_ptr) const override;
void Unmap(void *buffer, void *mapped_ptr) override;
bool OnHost() const override;
#ifdef MACE_ENABLE_RPCMEM
Rpcmem *rpcmem() override;
#endif // MACE_ENABLE_RPCMEM
private:
#ifdef MACE_ENABLE_RPCMEM
void CreateQualcommBufferIONHostPtr(const size_t nbytes,
......@@ -69,6 +73,7 @@ class OpenCLAllocator : public Allocator {
cl_mem_ion_host_ptr *ion_host);
std::unordered_map<void *, void *> cl_to_host_map_;
Rpcmem rpcmem_;
#endif // MACE_ENABLE_RPCMEM
OpenCLRuntime *opencl_runtime_;
};
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include <algorithm>
#include <string>
......@@ -22,7 +22,6 @@
#include "mace/utils/math.h"
namespace mace {
namespace ops {
std::vector<index_t> FormatBufferShape(
const std::vector<index_t> &buffer_shape,
......@@ -59,8 +58,10 @@ std::string DtToCLDt(const DataType dt) {
return "float";
case DT_HALF:
return "half";
case DT_UINT8:
return "uchar";
default:
LOG(FATAL) << "Unsupported data type";
LOG(FATAL) << "Unsupported data type: " << dt;
return "";
}
}
......@@ -365,5 +366,4 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
return MaceStatus::MACE_SUCCESS;
}
} // namespace ops
} // namespace mace
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_OPENCL_HELPER_H_
#define MACE_OPS_OPENCL_HELPER_H_
#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
#define MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
#include <memory>
#include <string>
......@@ -30,7 +30,6 @@
#include "mace/utils/math.h"
namespace mace {
namespace ops {
// oorc for 'Out Of Range Check'
#define MACE_OUT_OF_RANGE_DEFINITION \
std::shared_ptr<BufferBase> oorc_flag;
......@@ -161,6 +160,5 @@ std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
const uint32_t *gws,
const uint32_t kwg_size);
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPENCL_HELPER_H_
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_
......@@ -21,6 +21,7 @@ load(
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
"if_rpcmem_enabled",
)
cc_library(
......@@ -47,6 +48,8 @@ cc_library(
"-DMACE_ENABLE_HTA",
]) + if_apu_enabled([
"-DMACE_ENABLE_APU",
]) + if_rpcmem_enabled([
"-DMACE_ENABLE_RPCMEM",
]),
deps = [
"//mace/ops",
......
......@@ -520,9 +520,23 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
if (device_type_ == DeviceType::HEXAGON
|| device_type_ == DeviceType::HTA) {
#ifdef MACE_ENABLE_OPENCL
device_.reset(new HexagonDevice(
device_type_, thread_pool_.get(),
make_unique<GPUDevice>(
config.impl_->gpu_context()->opencl_tuner(),
config.impl_->gpu_context()->opencl_cache_storage(),
config.impl_->gpu_priority_hint(),
config.impl_->gpu_perf_hint(),
config.impl_->gpu_context()->opencl_binary_storage(),
config.impl_->num_threads(),
config.impl_->cpu_affinity_policy(),
thread_pool_.get())));
#else
device_.reset(new HexagonDevice(device_type_, thread_pool_.get()));
#endif // MACE_ENABLE_OPENCL
}
#endif
#endif // defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
#ifdef MACE_ENABLE_APU
if (device_type_ == DeviceType::APU) {
device_.reset(new ApuDevice(thread_pool_.get()));
......@@ -579,15 +593,19 @@ MaceStatus MaceEngine::Impl::Init(
<< MakeString(MapKeys(output_info_map_));
}
#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA)
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(DataFormat::NHWC);
if (device_type_ == HEXAGON || device_type_ == HTA) {
DataType output_dt = output_info_map_[output_name].data_type();
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), output_dt);
output_tensor->set_data_format(DataFormat::NHWC);
}
#endif
#if defined(MACE_ENABLE_APU)
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
output_tensor->set_data_format(DataFormat::NHWC);
if (device_type_ == DeviceType::APU) {
Tensor *output_tensor =
ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT);
output_tensor->set_data_format(DataFormat::NHWC);
}
#endif
}
#ifdef MACE_ENABLE_HEXAGON
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include <vector>
#include "mace/ops/opencl/buffer/utils.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include <vector>
#include "mace/ops/opencl/buffer/utils.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -23,7 +23,7 @@
#include <vector>
#include "mace/ops/opencl/buffer/utils.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -19,7 +19,7 @@
#include <vector>
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -209,3 +209,202 @@ __kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS
DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset));
vstore4(input_value, out_idx, output);
}
__kernel void buffer_quantize(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__private const float scale,
__private const int zero_point,
__global float *input,
__private const int input_offset,
__global uchar *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
uchar4 output_value =
convert_uchar4_sat_rte(vload4(out_idx, input) / scale + zero_point);
vstore4(output_value, out_idx, output);
}
__kernel void buffer_dequantize(BUFFER_OUT_OF_RANGE_PARAMS
__private const int global_size_dim0,
__private const float scale,
__private const int zero_point,
__global uchar *input,
__private const int input_offset,
__global float *output) {
const int out_idx = get_global_id(0);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_idx >= global_size_dim0) {
return;
}
#endif
float4 output_value =
convert_float4(convert_int4(vload4(out_idx, input)) - zero_point) * scale;
vstore4(output_value, out_idx, output);
}
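Both kernels handle four elements per work item and implement the standard affine quantization pair (note that both accept an input_offset parameter they do not apply). A host-side reference of the per-element math, as a hedged C++ sketch; the kernels round to nearest-even and saturate via convert_uchar4_sat_rte:

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t QuantizeOne(float x, float scale, int zero_point) {
  const int q = static_cast<int>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));  // saturate to [0, 255]
}

float DequantizeOne(uint8_t q, float scale, int zero_point) {
  return static_cast<float>(static_cast<int>(q) - zero_point) * scale;
}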
// NHWC -> NCHW (W roundup to 32)
__kernel void transform_nhwc_to_nchw32(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global uchar *input, // NHWC
__private const int input_offset,
__private const int zero_point,
__global uchar *output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channels) {
const int width_blk_idx = get_global_id(0);
const int h_idx = get_global_id(1);
const int bc_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_blk_idx >= global_size_dim0 ||
h_idx >= global_size_dim1 ||
bc_idx >= global_size_dim2) {
return;
}
#endif
const int b_idx = bc_idx / channels;
const int chan_idx = bc_idx - mul24(b_idx, channels);
const int w_idx = width_blk_idx << 2;
const int padded_width = global_size_dim0 << 2;
const int in_offset = mad24(mad24(mad24(b_idx, height, h_idx),
width, w_idx), channels, chan_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(b_idx, channels, chan_idx),
height, h_idx), padded_width, w_idx));
uchar4 value = zero_point;
if (w_idx + 3 < width) {
value.x = input[in_offset];
value.y = input[in_offset + channels];
value.z = input[in_offset + 2 * channels];
value.w = input[in_offset + 3 * channels];
} else if (w_idx < width) {
const int diff = width - w_idx;
switch(diff) {
case 3:
value.z = input[in_offset + 2 * channels];
case 2:
value.y = input[in_offset + channels];
case 1:
value.x = input[in_offset];
}
}
VSTORE4(value, output, out_offset);
}
// N H ceil(C/32) W 32 -> NHWC
__kernel void transform_d32_to_nhwc(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global uchar *input, // D32
__private const int input_offset,
__global uchar *output, // NHWC
__private const int batch,
__private const int height,
__private const int width,
__private const int channels,
__private const int channel_slices) {
const int chan_blk_idx = get_global_id(0);
const int w_idx = get_global_id(1);
const int bh_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 ||
w_idx >= global_size_dim1 ||
bh_idx >= global_size_dim2) {
return;
}
#endif
const int b_idx = bh_idx / height;
const int h_idx = bh_idx - mul24(b_idx, height);
const int c_idx = chan_blk_idx << 2;
const int c_slice = c_idx >> 5;
const int c_slice_idx = c_idx & 31;
const int in_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx),
channel_slices, c_slice), width, w_idx), 32, c_slice_idx) + input_offset;
const int out_offset = (mad24(mad24(mad24(b_idx, height, h_idx),
width, w_idx), channels, c_idx));
uchar4 value = vload4(0, input + in_offset);
if (c_idx + 3 < channels) {
VSTORE4(value, output, out_offset);
} else {
const int diff = channels - c_idx;
switch(diff) {
case 3:
vstore3(value.xyz, 0, output + out_offset);
break;
case 2:
vstore2(value.xy, 0, output + out_offset);
break;
case 1:
output[out_offset] = value.x;
break;
}
}
}
// NHWC -> N H ceil(C/32) W 32
__kernel void transform_nhwc_to_d32(BUFFER_OUT_OF_RANGE_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__global uchar *input, // NHWC
__private const int input_offset,
__private const int zero_point,
__global uchar *output, // D32
__private const int batch,
__private const int height,
__private const int width,
__private const int channels,
__private const int channel_slices) {
const int w_32_idx = get_global_id(0);
const int c_slice = get_global_id(1);
const int bh_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (w_32_idx >= global_size_dim0 ||
c_slice >= global_size_dim1 ||
bh_idx >= global_size_dim2) {
return;
}
#endif
const int b_idx = bh_idx / height;
const int h_idx = bh_idx - mul24(b_idx, height);
const int w_idx = w_32_idx >> 3;
const int c_slice_blk_idx = w_32_idx & 7;
const int c_slice_idx = c_slice_blk_idx << 2;
const int c_idx = (c_slice << 5) + c_slice_idx;
const int in_offset = (mad24(mad24(mad24(b_idx, height, h_idx),
width, w_idx), channels, c_idx)) + input_offset;
const int out_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx),
channel_slices, c_slice), width, w_idx), 32, c_slice_idx);
uchar4 value = zero_point;
if (c_idx + 3 < channels) {
value = vload4(0, input + in_offset);
} else if (c_idx < channels) {
value = vload4(0, input + in_offset);
const int diff = channels - c_idx;
switch(diff) {
case 3:
value.w = zero_point; break;
case 2:
value.zw = zero_point; break;
case 1:
value.yzw = zero_point; break;
}
} // else value = zero_point
VSTORE4(value, output, out_offset);
}
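The D32 kernels use the layout N x H x ceil(C/32) x W x 32: channels are split into slices of 32, each slice stores a full W row of 32-channel groups, and the channel tail is padded with zero_point. The index math from the mad24 chains above, restated as a hedged host-side helper:

// Offset of element (b, h, c, w) in a D32 buffer;
// channel_slices == ceil(channels / 32).
inline int D32Offset(int b, int h, int c, int w,
                     int height, int width, int channel_slices) {
  const int c_slice = c >> 5;     // c / 32
  const int c_in_slice = c & 31;  // c % 32
  return ((((b * height + h) * channel_slices + c_slice) * width + w) << 5)
         + c_in_slice;
}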
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -15,7 +15,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/math.h"
namespace mace {
......
......@@ -14,7 +14,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/common/activation_type.h"
#include "mace/utils/math.h"
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -25,7 +25,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/eltwise_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -21,7 +21,7 @@
#include "mace/core/op_context.h"
#include "mace/ops/opencl/buffer_transform_kernel.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -17,7 +17,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/opencl/lpnorm.h"
namespace mace {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -20,7 +20,7 @@
#include "mace/core/op_context.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/opencl/mvnorm.h"
namespace mace {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/common/pad_type.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/common/reduce_type.h"
namespace mace {
......
......@@ -21,7 +21,7 @@
#include <memory>
#include "mace/core/operator.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/ops/opencl/buffer_transform_kernel.h"
namespace mace {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -24,7 +24,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -23,7 +23,7 @@
#include "mace/core/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
namespace mace {
namespace ops {
......
......@@ -16,7 +16,7 @@
#include "mace/core/op_context.h"
#include "mace/ops/common/activation_type.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
#include "mace/utils/math.h"
......
......@@ -175,9 +175,9 @@ bool RunModel(const std::string &model_name,
if (status != MaceStatus::MACE_SUCCESS) {
LOG(WARNING) << "Failed to set OpenMP or CPU affinity.";
}
#ifdef MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA)
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU) {
if (device_type == DeviceType::GPU || device_type == DeviceType::HTA) {
const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
const std::string storage_path =
std::string(storage_path_ptr == nullptr ?
......
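The hunk above routes HTA runs through the same GPUContext used for OpenCL, since the HTA input/output transforms execute OpenCL kernels. A minimal sketch of the completed branch, assuming MACE's public GPUContextBuilder API (the fallback storage path shown is a placeholder, not the truncated value above):
// Sketch only: how the GPU/HTA context setup typically completes.
// GPUContextBuilder is MACE's public API; the fallback path is a placeholder.
std::shared_ptr<GPUContext> gpu_context;
if (device_type == DeviceType::GPU || device_type == DeviceType::HTA) {
  const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
  const std::string storage_path(storage_path_ptr == nullptr
                                     ? "/path/to/internal/storage"  // placeholder
                                     : storage_path_ptr);
  gpu_context = GPUContextBuilder()
                    .SetStoragePath(storage_path)  // caches compiled OpenCL programs
                    .Finalize();
}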
......@@ -9,6 +9,7 @@ load(
"if_android",
"if_android_armv7",
"if_hexagon_enabled",
"if_hta_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
......@@ -41,7 +42,9 @@ cc_test(
[
"mace/ops/opencl/*.cc",
]
)),
)) + if_hta_enabled([
"mace/core/runtime/hexagon/hta_transform_test.cc",
]),
copts = [
"-Werror",
"-Wextra",
......@@ -59,6 +62,8 @@ cc_test(
"-DMACE_ENABLE_QUANTIZE",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
......
......@@ -7,6 +7,10 @@ file(GLOB MACE_CC_TEST_SRCS
mace/libmace/*.cc
)
if(MACE_ENABLE_HTA)
set(MACE_CC_TEST_SRCS ${MACE_CC_TEST_SRCS} mace/core/runtime/hexagon/hta_transform_test.cc)
endif(MACE_ENABLE_HTA)
add_executable(mace_cc_test ${MACE_CC_TEST_SRCS})
target_link_libraries(mace_cc_test PUBLIC
mace_cc_test_utils
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/quantize.h"
#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class HTATransformTest : public OpsTestBase {};
namespace {
template <DeviceType D>
void TestHTAQuantizeDequantize(const std::vector<float> &input) {
float min_val, max_val;
FindMinMax(input.data(), input.size(), &min_val, &max_val);
float scale;
int32_t zero;
AdjustRange<uint8_t>(min_val, max_val, false, &scale, &zero);
OpsTestNet net;
Device *device = OpTestContext::Get()->GetDevice(D);
net.AddInputFromArray<D, float>("Input",
{static_cast<index_t>(input.size())},
input);
Tensor *input_tensor = net.GetOutput("Input");
input_tensor->SetScale(scale);
input_tensor->SetZeroPoint(zero);
Tensor *quantized_output = net.ws()->CreateTensor(
"QuantizedOutput", device->allocator(), DT_UINT8);
Tensor *dequantized_output = net.ws()->CreateTensor(
"DequantizedOutput", device->allocator(), DT_FLOAT);
mace::HexagonHTATranformer<D> transformer;
transformer.Init(device);
transformer.Quantize(input_tensor, quantized_output);
transformer.Dequantize(quantized_output, dequantized_output);
ExpectTensorNear<float>(*input_tensor,
*dequantized_output,
0.1);
}
} // namespace
TEST_F(HTATransformTest, TestHTAQuantize) {
TestHTAQuantizeDequantize<CPU>({-2, -1, 0, 1, 2, 3, 4});
TestHTAQuantizeDequantize<GPU>({-2, -1, 0, 1, 2, 3, 4});
}
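The round trip exercised above follows standard uint8 affine quantization, q = round(x / scale) + zero_point. A self-contained sketch of the scheme (the range nudging mirrors the usual TF-Lite convention and is an assumption about AdjustRange's exact behavior):
#include <algorithm>
#include <cmath>
#include <cstdint>

// Standalone sketch of uint8 affine quantize/dequantize; not MACE's
// AdjustRange itself, whose rounding details may differ.
float QuantizeDequantize(float x, float min_val, float max_val) {
  min_val = std::min(min_val, 0.f);  // representable range must contain 0
  max_val = std::max(max_val, 0.f);
  const float scale = std::max((max_val - min_val) / 255.f, 1e-9f);
  const int32_t zero = static_cast<int32_t>(std::round(-min_val / scale));
  const float q = std::min(255.f, std::max(0.f, std::round(x / scale) + zero));
  return scale * (q - zero);  // reconstruction error is at most ~scale
}
For the inputs above ({-2, ..., 4}), scale is roughly 6/255, about 0.024, so the reconstruction error stays well inside the 0.1 tolerance passed to ExpectTensorNear.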
namespace {
void TestHTAInputTransform(const std::vector<index_t> &input_shape,
const hexagon_hta_hw_layout format) {
OpsTestNet net;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
net.AddRandomInput<GPU, uint8_t>("Input", input_shape);
Tensor *input_tensor = net.GetOutput("Input");
input_tensor->SetScale(0.1);
input_tensor->SetZeroPoint(1);
Tensor *cpu_transformed_tensor = net.ws()->CreateTensor(
"CpuTransformedOutput", device->allocator(), DT_UINT8);
Tensor *gpu_transformed_tensor = net.ws()->CreateTensor(
"GpuTransformedOutput", device->allocator(), DT_UINT8);
mace::HexagonHTATranformer<CPU> cpu_transformer;
mace::HexagonHTATranformer<GPU> gpu_transformer;
cpu_transformer.Init(device);
gpu_transformer.Init(device);
cpu_transformer.SetInputTransformer(format);
gpu_transformer.SetInputTransformer(format);
cpu_transformer.TransformInput(input_tensor, cpu_transformed_tensor, 0);
gpu_transformer.TransformInput(input_tensor, gpu_transformed_tensor, 0);
net.Sync();
ExpectTensorNear<uint8_t>(*cpu_transformed_tensor, *gpu_transformed_tensor);
}
} // namespace
TEST_F(HTATransformTest, TestHTAInputTransform) {
TestHTAInputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_PLANAR);
TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR);
TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR);
TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAInputTransform({1, 15, 33, 27}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAInputTransform({1, 15, 33, 35}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_D32);
}
namespace {
void TestHTAOutputTransform(const std::vector<index_t> &output_shape,
const hexagon_hta_hw_layout format) {
index_t batch = output_shape[0];
index_t height = output_shape[1];
index_t width = output_shape[2];
index_t channels = output_shape[3];
MACE_CHECK(format == HEXAGON_HTA_HW_FORMAT_D32);
std::vector<index_t> input_shape {
batch, height, RoundUpDiv<index_t>(channels, 32), width, 32};
OpsTestNet net;
Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU);
net.AddRandomInput<GPU, uint8_t>("Input", input_shape);
Tensor *input_tensor = net.GetOutput("Input");
Tensor *cpu_transformed_tensor = net.ws()->CreateTensor(
"CpuTransformedOutput", device->allocator(), DT_UINT8);
Tensor *gpu_transformed_tensor = net.ws()->CreateTensor(
"GpuTransformedOutput", device->allocator(), DT_UINT8);
cpu_transformed_tensor->Resize(output_shape);
gpu_transformed_tensor->Resize(output_shape);
mace::HexagonHTATranformer<CPU> cpu_transformer;
mace::HexagonHTATranformer<GPU> gpu_transformer;
cpu_transformer.Init(device);
gpu_transformer.Init(device);
cpu_transformer.SetOutputTransformer(format);
gpu_transformer.SetOutputTransformer(format);
cpu_transformer.TransformOutput(input_tensor, cpu_transformed_tensor, 0);
gpu_transformer.TransformOutput(input_tensor, gpu_transformed_tensor, 0);
net.Sync();
ExpectTensorNear<uint8_t>(*cpu_transformed_tensor, *gpu_transformed_tensor);
}
} // namespace
TEST_F(HTATransformTest, TestHTAOutputTransform) {
TestHTAOutputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 19, 31, 27}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 19, 31, 35}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 224, 224, 2}, HEXAGON_HTA_HW_FORMAT_D32);
TestHTAOutputTransform({1, 384, 384, 3}, HEXAGON_HTA_HW_FORMAT_D32);
}
} // namespace test
} // namespace ops
} // namespace mace
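The D32 shapes used in these tests encode Hexagon's depth-32 layout: the channel dimension is split into RoundUpDiv(C, 32) groups of 32, turning an NHWC tensor into {N, H, ceil(C/32), W, 32}. A sketch of the offset mapping this implies (illustrative only; the real transformers also handle the padded tail channels and vectorized copies):
#include <cstdint>

// Sketch: flat offset of NHWC element (n, h, w, c) inside the D32 layout
// {N, H, ceil(C / 32), W, 32} constructed in TestHTAOutputTransform above.
// Channels c >= C in the last group are padding and carry no data.
int64_t OffsetD32(int64_t n, int64_t h, int64_t w, int64_t c,
                  int64_t H, int64_t W, int64_t C) {
  const int64_t groups = (C + 31) / 32;  // RoundUpDiv<index_t>(C, 32)
  return ((((n * H + h) * groups + c / 32) * W + w) * 32) + c % 32;
}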
......@@ -21,7 +21,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/ops/opencl/helper.h"
#include "mace/core/runtime/opencl/opencl_helper.h"
#include "mace/utils/memory.h"
namespace mace {
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/quantize.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......
......@@ -250,8 +250,24 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
Tensor::MappingGuard y_mapper(&y);
auto a = x.data<EXP_TYPE>();
auto b = y.data<RES_TYPE>();
for (int i = 0; i < x.size(); ++i) {
ExpectEqual(a[i], b[i]);
if (x.dim_size() == 4) {
for (int n = 0; n < x.dim(0); ++n) {
for (int h = 0; h < x.dim(1); ++h) {
for (int w = 0; w < x.dim(2); ++w) {
for (int c = 0; c < x.dim(3); ++c) {
EXPECT_EQ(*a, *b) << "with index = [" << n << ", " << h << ", "
<< w << ", " << c << "]";
a++;
b++;
}
}
}
}
} else {
for (int i = 0; i < x.size(); ++i) {
EXPECT_EQ(a[i], b[i])
<< "a[i] = " << a[i] << " b[i] = " << b[i] << " index = " << i;
}
}
}
......
/*
* Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
* Copyright (c) 2019, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted (subject to the limitations in the
......@@ -42,10 +42,6 @@ extern "C" {
int hexnn_controller_request_unsigned_pd();
int hexnn_controller_init();
int hexnn_controller_deinit();
#ifdef __cplusplus
}
#endif
......
......@@ -6,6 +6,8 @@
#ifndef RPCMEM_H
#define RPCMEM_H
#include <stdint.h>
/**
* RPCMEM_DEFAULT_HEAP
* Dynamically select the heap to use. This should be OK for most use cases.
......@@ -77,16 +79,25 @@
extern "C" {
#endif
typedef struct rpcmem rpcmem;
struct rpcmem {
void *lst; // QList*
void *mt; // pthread_mutex_t*
int ionfd;
int flag;
int ionversion;
};
/**
* call once to initialize the library
* NOTE: rpcmem_init is not thread safe
* NOTE: rpcmem_init is now thread safe
*/
void rpcmem_init(void);
void rpcmem_init(rpcmem *rm);
/**
* call once for cleanup
* NOTE: rpcmem_deinit is not thread safe
* NOTE: rpcmem_deinit is now thread safe
*/
void rpcmem_deinit(void);
void rpcmem_deinit(rpcmem *rm);
/**
* Allocate via ION a buffer of size
......@@ -96,10 +107,10 @@ void rpcmem_deinit(void);
* @retval, 0 on failure, pointer to buffer on success
*
* For example:
* buf = rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
* buf = rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
*/
void* rpcmem_alloc(int heapid, unsigned int flags, int size);
void* rpcmem_alloc(rpcmem *rm, int heapid, uint32_t flags, int size);
/**
* allocate with default settings
......@@ -107,24 +118,24 @@ void* rpcmem_alloc(int heapid, unsigned int flags, int size);
#if !defined(WINNT) && !defined (_WIN32_WINNT)
__attribute__((unused))
#endif
static __inline void* rpcmem_alloc_def(int size) {
return rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
static __inline void* rpcmem_alloc_def(rpcmem *rm, int size) {
return rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
}
/**
* free buffer, ignores invalid buffers
*/
void rpcmem_free(void* po);
void rpcmem_free(rpcmem *rm, void* po);
/**
* returns associated fd
*/
int rpcmem_to_fd(void* po);
int rpcmem_to_fd(rpcmem *rm, void* po);
/**
* cache coherency management
*/
int rpcmem_sync_cache(void* po, unsigned int flags);
int rpcmem_sync_cache(rpcmem *rm, void* po, uint32_t flags);
#ifdef __cplusplus
}
......
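The rpcmem changes above replace hidden process-global state with an explicit rpcmem handle threaded through every call, which is what makes init/deinit thread safe. A minimal usage sketch of the handle-based API (error handling trimmed; the size and flags values are arbitrary):
#include "rpcmem.h"

// Sketch: each user now owns an rpcmem instance instead of sharing globals.
void RpcmemExample() {
  rpcmem rm;
  rpcmem_init(&rm);                          // per-handle init, thread safe
  void *buf = rpcmem_alloc_def(&rm, 4096);   // ION-backed shared buffer
  if (buf != nullptr) {
    int fd = rpcmem_to_fd(&rm, buf);         // fd to hand to the DSP/HTA
    (void)fd;
    rpcmem_sync_cache(&rm, buf, 0);          // flags value is illustrative
    rpcmem_free(&rm, buf);
  }
  rpcmem_deinit(&rm);
}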
......@@ -21,6 +21,11 @@ mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_dsp
mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu
mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_apu
if [[ "$BUILD_HTA" == "1" ]]; then
mkdir -p $LIB_DIR/armeabi-v7a/cpu_gpu_hta
mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_hta
fi
rm -rf $LIB_DIR/linux-x86-64
mkdir -p $LIB_DIR/linux-x86-64
......@@ -33,6 +38,18 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
# build shared libraries
if [[ "$BUILD_HTA" == "1" ]]; then
echo "build shared lib for armeabi-v7a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
echo "build shared lib for arm64-v8a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
fi
echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
......@@ -71,6 +88,18 @@ if [[ "$OSTYPE" != "darwin"* ]];then
fi
# build static libraries
if [[ "$BUILD_HTA" == "1" ]]; then
echo "build static lib for armeabi-v7a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/
echo "build static lib for arm64-v8a + cpu_gpu_hta"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_hta/
cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/
fi
echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
......
......@@ -100,6 +100,11 @@ def parse_args():
type=str2bool,
default=True,
help="Whether to use rpcmem")
parser.add_argument(
"--enable_hta",
type=str2bool,
default=False,
help="Whether to use hta")
parser.add_argument(
'--address_sanitizer',
action="store_true",
......@@ -170,6 +175,7 @@ def main(unused_args):
enable_neon=FLAGS.enable_neon,
enable_quantize=FLAGS.enable_quantize,
enable_rpcmem=FLAGS.enable_rpcmem,
enable_hta=FLAGS.enable_hta,
address_sanitizer=FLAGS.address_sanitizer,
debug_mode=FLAGS.debug_mode)
if FLAGS.run_target:
......
......@@ -218,7 +218,8 @@ def get_opencl_mode(configs):
YAMLKeyword.runtime, "")
runtime_list.append(model_runtime.lower())
if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list:
if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list \
or RuntimeType.hta in runtime_list:
return True
return False
......
......@@ -295,6 +295,20 @@ class HexagonConverter(base_converter.ConverterInterface):
else:
index += 1
if self._option.device == DeviceType.HTA.value:
# replace QuantizeINPUT_f_to_8 with INPUT
quantize_input_op.type = HexagonOp.INPUT.name
del quantize_input_op.output_shape[1:]
del quantize_input_op.output_type[1:]
del quantize_input_op.out_max_byte_size[1:]
# replace first op's input min max with constant
self.add_constant_min_max_for_first_op(self._model.op[1])
# replace DequantizeOUTPUT_8tof with OUTPUT
dequantize_output_op.type = HexagonOp.OUTPUT.name
del dequantize_output_op.input[1:]
return quantize_input_op.output
def add_node_id(self, model_inputs):
......
......@@ -605,18 +605,20 @@ def create_internal_storage_dir(serialno, phone_data_dir):
def push_depended_so_libs(libmace_dynamic_library_path,
abi, phone_data_dir, serialno):
dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
libmace_dynamic_library_path)
src_file = ""
for dep in split_stdout(dep_so_libs):
if dep == "libgnustl_shared.so":
src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
"%s/libgnustl_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
elif dep == "libc++_shared.so":
src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
"%s/libc++_shared.so"\
% (os.environ["ANDROID_NDK_HOME"], abi)
src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \
"%s/libc++_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
try:
dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends",
libmace_dynamic_library_path)
except sh.ErrorReturnCode_127:
print("Find no ndk-depends, use default libc++_shared.so")
else:
for dep in split_stdout(dep_so_libs):
if dep == "libgnustl_shared.so":
src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \
"%s/libgnustl_shared.so" \
% (os.environ["ANDROID_NDK_HOME"], abi)
print("push %s to %s" % (src_file, phone_data_dir))
adb_push(src_file, phone_data_dir, serialno)
......