diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel index fcb7b20773fafa2c48b9e1636271308ec161d8cf..971b2a271c389b11c61f37e1def1ce49b4537a2e 100644 --- a/mace/core/BUILD.bazel +++ b/mace/core/BUILD.bazel @@ -28,21 +28,37 @@ cc_library( "*.cc", "runtime/cpu/*.cc", ], + exclude = [ + "rpcmem.cc", + ], ) + if_opencl_enabled(glob( [ "runtime/opencl/*.cc", ], - )) + if_hexagon_enabled([ + )) + if_hexagon_or_hta_enabled([ + "runtime/hexagon/hexagon_allocator.h", + "runtime/hexagon/hexagon_allocator.cc", + "runtime/hexagon/hexagon_device.cc", + ]) + if_hexagon_enabled([ "runtime/hexagon/hexagon_dsp_wrapper.cc", ]) + if_hta_enabled([ + "runtime/hexagon/hexagon_hta_transformer.h", + "runtime/hexagon/hexagon_hta_transformer.cc", "runtime/hexagon/hexagon_hta_wrapper.cc", ]) + if_apu_enabled(glob([ "runtime/apu/*.cc", - ])), - hdrs = glob([ - "*.h", - "runtime/cpu/*.h", - ]) + if_opencl_enabled(glob([ + ])) + if_rpcmem_enabled([ + "rpcmem.cc", + ]), + hdrs = glob( + [ + "*.h", + "runtime/cpu/*.h", + ], + exclude = [ + "rpcmem.h", + ], + ) + if_opencl_enabled(glob([ "runtime/opencl/*.h", ])) + if_hexagon_or_hta_enabled(glob([ "runtime/hexagon/hexagon_control_wrapper.h", @@ -53,7 +69,9 @@ cc_library( "runtime/hexagon/*hta*.h", ])) + if_apu_enabled(glob([ "runtime/apu/*.h" - ])), + ])) + if_rpcmem_enabled([ + "rpcmem.h", + ]), copts = [ "-Werror", "-Wextra", diff --git a/mace/core/CMakeLists.txt b/mace/core/CMakeLists.txt index 75b74bb9005d321355332de46f5921ba7d638822..25ab20bff9167b3936f8fb2101c3c9165016ea46 100644 --- a/mace/core/CMakeLists.txt +++ b/mace/core/CMakeLists.txt @@ -17,11 +17,17 @@ set(CORE_SRCS runtime/cpu/cpu_runtime.cc ) +if(MACE_ENABLE_RPCMEM) + set(CORE_SRCS ${CORE_SRCS} rpcmem.cc) + set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem) +endif(MACE_ENABLE_RPCMEM) + if(MACE_ENABLE_OPENCL) set(CORE_SRCS ${CORE_SRCS} runtime/opencl/gpu_device.cc runtime/opencl/gpu_runtime.cc runtime/opencl/opencl_allocator.cc + runtime/opencl/opencl_helper.cc runtime/opencl/opencl_runtime.cc runtime/opencl/opencl_util.cc runtime/opencl/opencl_wrapper.cc @@ -30,12 +36,20 @@ if(MACE_ENABLE_OPENCL) set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} generated_opencl_kernel dl) endif(MACE_ENABLE_OPENCL) +if(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA) + set(CORE_SRCS ${CORE_SRCS} + runtime/hexagon/hexagon_allocator.cc + runtime/hexagon/hexagon_device.cc + ) +endif(MACE_ENABLE_HEXAGON_DSP OR MACE_ENABLE_HEXAGON_HTA) + if(MACE_ENABLE_HEXAGON_DSP) set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_dsp_wrapper.cc) set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hexagon_controller) endif(MACE_ENABLE_HEXAGON_DSP) if(MACE_ENABLE_HEXAGON_HTA) + set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_transformer.cc) set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_wrapper.cc) set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hta_hexagon_runtime) endif(MACE_ENABLE_HEXAGON_HTA) @@ -45,10 +59,6 @@ if(MACE_ENABLE_MTK_APU) set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} apu-frontend) endif(MACE_ENABLE_MTK_APU) -if(MACE_ENABLE_RPCMEM) - set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem) -endif(MACE_ENABLE_RPCMEM) - add_library(core STATIC ${CORE_SRCS}) target_link_libraries(core PRIVATE proto diff --git a/mace/core/allocator.cc b/mace/core/allocator.cc index 003b1c2c02e2907651c56193180439a08223f97a..28ceeedcd4828154ab4a53cc2a9198849252ae94 100644 --- a/mace/core/allocator.cc +++ b/mace/core/allocator.cc @@ -16,6 +16,54 @@ namespace mace { +MaceStatus Allocator::NewImage(const std::vector &image_shape, + const DataType dt, + void **result) { + MACE_UNUSED(image_shape); + MACE_UNUSED(dt); + MACE_UNUSED(result); + MACE_NOT_IMPLEMENTED; + return MaceStatus::MACE_SUCCESS; +} + +void Allocator::DeleteImage(void *data) { + MACE_UNUSED(data); + MACE_NOT_IMPLEMENTED; +} + +void *Allocator::Map(void *buffer, + size_t offset, + size_t nbytes, + bool finish_cmd_queue) { + MACE_UNUSED(nbytes); + MACE_UNUSED(finish_cmd_queue); + return reinterpret_cast(buffer) + offset; +} + +void *Allocator::MapImage(void *buffer, + const std::vector &image_shape, + std::vector *mapped_image_pitch, + bool finish_cmd_queue) { + MACE_UNUSED(buffer); + MACE_UNUSED(image_shape); + MACE_UNUSED(mapped_image_pitch); + MACE_UNUSED(finish_cmd_queue); + MACE_NOT_IMPLEMENTED; + return nullptr; +} + +void Allocator::Unmap(void *buffer, void *mapper_ptr) { + MACE_UNUSED(buffer); + MACE_UNUSED(mapper_ptr); +} + +#ifdef MACE_ENABLE_RPCMEM +Rpcmem *Allocator::rpcmem() { + MACE_NOT_IMPLEMENTED; + return nullptr; +} +#endif // MACE_ENABLE_RPCMEM + Allocator *GetCPUAllocator() { static CPUAllocator allocator; return &allocator; diff --git a/mace/core/allocator.h b/mace/core/allocator.h index e4e7b35f0b58a9e98a70bfa6f3559ab1e039463b..c9f8f8f6c10db6e27047aa9aedf42dd8eb217a18 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -27,6 +27,10 @@ #include "mace/public/mace.h" #include "mace/utils/logging.h" +#ifdef MACE_ENABLE_RPCMEM +#include "mace/core/rpcmem.h" +#endif // MACE_ENABLE_RPCMEM + namespace mace { #if defined(__hexagon__) @@ -50,19 +54,22 @@ class Allocator { virtual MaceStatus New(size_t nbytes, void **result) = 0; virtual MaceStatus NewImage(const std::vector &image_shape, const DataType dt, - void **result) = 0; + void **result); virtual void Delete(void *data) = 0; - virtual void DeleteImage(void *data) = 0; + virtual void DeleteImage(void *data); virtual void *Map(void *buffer, size_t offset, size_t nbytes, - bool finish_cmd_queue) const = 0; + bool finish_cmd_queue); virtual void *MapImage(void *buffer, const std::vector &image_shape, std::vector *mapped_image_pitch, - bool finish_cmd_queue) const = 0; - virtual void Unmap(void *buffer, void *mapper_ptr) const = 0; + bool finish_cmd_queue); + virtual void Unmap(void *buffer, void *mapper_ptr); virtual bool OnHost() const = 0; +#ifdef MACE_ENABLE_RPCMEM + virtual Rpcmem *rpcmem(); +#endif // MACE_ENABLE_RPCMEM }; class CPUAllocator : public Allocator { @@ -84,46 +91,12 @@ class CPUAllocator : public Allocator { return MaceStatus::MACE_SUCCESS; } - MaceStatus NewImage(const std::vector &shape, - const DataType dt, - void **result) override { - MACE_UNUSED(shape); - MACE_UNUSED(dt); - MACE_UNUSED(result); - LOG(FATAL) << "Allocate CPU image"; - return MaceStatus::MACE_SUCCESS; - } - void Delete(void *data) override { MACE_CHECK_NOTNULL(data); VLOG(3) << "Free CPU buffer"; free(data); } - void DeleteImage(void *data) override { - LOG(FATAL) << "Free CPU image"; - free(data); - }; - void *Map(void *buffer, - size_t offset, - size_t nbytes, - bool finish_cmd_queue) const override { - MACE_UNUSED(nbytes); - MACE_UNUSED(finish_cmd_queue); - return reinterpret_cast(buffer) + offset; - } - void *MapImage(void *buffer, - const std::vector &image_shape, - std::vector *mapped_image_pitch, - bool finish_cmd_queue) const override { - MACE_UNUSED(image_shape); - MACE_UNUSED(mapped_image_pitch); - MACE_UNUSED(finish_cmd_queue); - return buffer; - } - void Unmap(void *buffer, void *mapper_ptr) const override { - MACE_UNUSED(buffer); - MACE_UNUSED(mapper_ptr); - } + bool OnHost() const override { return true; } }; diff --git a/mace/core/quantize.h b/mace/core/quantize.h index c7b6666dda378e99ad7a99714a6c7dca74b41619..439c9522b20212495e3fc7d0e2a5a9b79df012f0 100644 --- a/mace/core/quantize.h +++ b/mace/core/quantize.h @@ -117,9 +117,12 @@ inline void GetOutputMultiplierAndShift( template class QuantizeUtil { public: + QuantizeUtil() = default; explicit QuantizeUtil(utils::ThreadPool *thread_pool) : thread_pool_(thread_pool) {} + void Init(utils::ThreadPool *thread_pool) { thread_pool_ = thread_pool; } + void QuantizeWithScaleAndZeropoint(const float *input, const index_t size, float scale, diff --git a/mace/core/rpcmem.cc b/mace/core/rpcmem.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3b3ed396ab1f7199a2f2f2dad109c7fc582d634 --- /dev/null +++ b/mace/core/rpcmem.cc @@ -0,0 +1,54 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/rpcmem.h" + +#include "mace/utils/logging.h" + +namespace mace { + +Rpcmem::Rpcmem() { + rpcmem_init(&rm); + MACE_CHECK(rm.flag == 0, "rpcmem_init failed!"); +} + +Rpcmem::~Rpcmem() { + rpcmem_deinit(&rm); +} + +void *Rpcmem::New(int heapid, uint32_t flags, int nbytes) { + return rpcmem_alloc(&rm, heapid, flags, nbytes); +} + +void *Rpcmem::New(int nbytes) { + return New(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED, nbytes); +} + +void Rpcmem::Delete(void *data) { + rpcmem_free(&rm, data); +} + +int Rpcmem::ToFd(void *data) { + return rpcmem_to_fd(&rm, data); +} + +int Rpcmem::SyncCacheStart(void *data) { + return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_START); +} + +int Rpcmem::SyncCacheEnd(void *data) { + return rpcmem_sync_cache(&rm, data, RPCMEM_SYNC_END); +} + +} // namespace mace diff --git a/mace/core/rpcmem.h b/mace/core/rpcmem.h new file mode 100644 index 0000000000000000000000000000000000000000..7d5795c21c0541705f3bcf0d9b04f1e74fd024e2 --- /dev/null +++ b/mace/core/rpcmem.h @@ -0,0 +1,36 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RPCMEM_H_ +#define MACE_CORE_RPCMEM_H_ + +#include "third_party/rpcmem/rpcmem.h" + +namespace mace { +class Rpcmem { + public: + Rpcmem(); + ~Rpcmem(); + void *New(int heapid, uint32_t flags, int nbytes); + void *New(int nbytes); + void Delete(void *data); + int ToFd(void *data); + int SyncCacheStart(void *data); + int SyncCacheEnd(void *data); + + private: + rpcmem rm; +}; +} // namespace mace +#endif // MACE_CORE_RPCMEM_H_ diff --git a/mace/core/runtime/hexagon/hexagon_allocator.cc b/mace/core/runtime/hexagon/hexagon_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..2c1f034ca5a0db2aca3295b84449077d71fe62d1 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_allocator.cc @@ -0,0 +1,38 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/hexagon/hexagon_allocator.h" + +namespace mace { + +MaceStatus HexagonAllocator::New(size_t nbytes, void **result) { + *result = rpcmem_.New(nbytes); + MACE_CHECK_NOTNULL(*result); + memset(*result, 0, nbytes); + return MaceStatus::MACE_SUCCESS; +} + +void HexagonAllocator::Delete(void *data) { + rpcmem_.Delete(data); +} + +bool HexagonAllocator::OnHost() const { + return true; +} + +Rpcmem *HexagonAllocator::rpcmem() { + return &rpcmem_; +} + +} // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_allocator.h b/mace/core/runtime/hexagon/hexagon_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..d4143996d424141335032eb31abb1efe91134c4a --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_allocator.h @@ -0,0 +1,38 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_ + +#include "mace/core/allocator.h" + +namespace mace { + +class HexagonAllocator : public Allocator { + public: + MaceStatus New(size_t nbytes, void **result) override; + + void Delete(void *buffer) override; + + bool OnHost() const override; + + Rpcmem *rpcmem() override; + + private: + Rpcmem rpcmem_; +}; + +} // namespace mace + +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_ALLOCATOR_H_ diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.h b/mace/core/runtime/hexagon/hexagon_control_wrapper.h index 0ab7e3f9e8cfa46a80e684d9fd66cc45b3bf6a12..801c095ddf4b20df0db3b6a7daa3bbc370f3ac35 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h @@ -25,27 +25,15 @@ #include "mace/public/mace.h" namespace mace { - struct InOutInfo { InOutInfo(const std::string &name, const std::vector &shape, - const DataType data_type, - const float scale, - const int32_t zero_point, - std::unique_ptr tensor_u8) - : name(name), - shape(shape), - data_type(data_type), - scale(scale), - zero_point(zero_point), - tensor_u8(std::move(tensor_u8)) {} + const DataType data_type) + : name(name), shape(shape), data_type(data_type) {} std::string name; std::vector shape; DataType data_type; - float scale; - int32_t zero_point; - std::unique_ptr tensor_u8; }; class HexagonControlWrapper { @@ -79,8 +67,6 @@ class HexagonControlWrapper { int nn_id_; - std::vector input_info_; - std::vector output_info_; int num_inputs_; int num_outputs_; diff --git a/mace/core/runtime/hexagon/hexagon_device.cc b/mace/core/runtime/hexagon/hexagon_device.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8a0aa4e49873a306dcfa795a2a96d351f534dc5 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_device.cc @@ -0,0 +1,71 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/hexagon/hexagon_device.h" + +namespace mace { + +HexagonDevice::HexagonDevice(DeviceType device_type, + utils::ThreadPool *thread_pool +#ifdef MACE_ENABLE_OPENCL + , std::unique_ptr gpu_device +#endif // MACE_ENABLE_OPENCL + ) + : CPUDevice(0, AFFINITY_NONE, thread_pool), + allocator_(make_unique()), + device_type_(device_type) +#ifdef MACE_ENABLE_OPENCL + , gpu_device_(std::move(gpu_device)) +#endif // MACE_ENABLE_OPENCL + {} + +#ifdef MACE_ENABLE_OPENCL +GPURuntime *HexagonDevice::gpu_runtime() { + return gpu_device_->gpu_runtime(); +} +#endif // MACE_ENABLE_OPENCL + +Allocator *HexagonDevice::allocator() { +#ifdef MACE_ENABLE_OPENCL + return gpu_device_->allocator(); +#else + return allocator_.get(); +#endif // MACE_ENABLE_OPENCL +} + +DeviceType HexagonDevice::device_type() const { + return device_type_; +} + +std::unique_ptr CreateHexagonControlWrapper( + Device *device) { + std::unique_ptr hexagon_controller; + auto device_type = device->device_type(); + switch (device_type) { +#ifdef MACE_ENABLE_HEXAGON + case HEXAGON: + hexagon_controller = make_unique(); + break; +#endif +#ifdef MACE_ENABLE_HTA + case HTA: + hexagon_controller = make_unique(device); + break; +#endif + default:LOG(FATAL) << "Not supported Hexagon device type: " << device_type; + } + + return hexagon_controller; +} +} // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_device.h b/mace/core/runtime/hexagon/hexagon_device.h index b17b19e5469cb5bb01e42f9beecdba286d8454af..660965eb9f89c46a395b0087ea6f864304a2f668 100644 --- a/mace/core/runtime/hexagon/hexagon_device.h +++ b/mace/core/runtime/hexagon/hexagon_device.h @@ -19,6 +19,7 @@ #include #include "mace/core/device.h" +#include "mace/core/runtime/hexagon/hexagon_allocator.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" #ifdef MACE_ENABLE_HEXAGON #include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" @@ -26,44 +27,39 @@ #ifdef MACE_ENABLE_HTA #include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h" #endif +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/gpu_device.h" +#include "mace/core/runtime/opencl/gpu_runtime.h" +#endif namespace mace { class HexagonDevice : public CPUDevice { public: - explicit HexagonDevice(DeviceType device_type, - utils::ThreadPool *thread_pool) - : CPUDevice(0, AFFINITY_NONE, thread_pool), - device_type_(device_type) {} + HexagonDevice(DeviceType device_type, +#ifdef MACE_ENABLE_OPENCL + utils::ThreadPool *thread_pool, + std::unique_ptr gpu_device); +#else + utils::ThreadPool *thread_pool); +#endif // MACE_ENABLE_OPENCL - DeviceType device_type() const override { - return device_type_; - }; +#ifdef MACE_ENABLE_OPENCL + GPURuntime *gpu_runtime() override; +#endif // MACE_ENABLE_OPENCL + Allocator *allocator() override; + DeviceType device_type() const override; private: + std::unique_ptr allocator_; DeviceType device_type_; +#ifdef MACE_ENABLE_OPENCL + std::unique_ptr gpu_device_; +#endif // MACE_ENABLE_OPENCL }; std::unique_ptr CreateHexagonControlWrapper( - Device *device) { - std::unique_ptr hexagon_controller; - auto device_type = device->device_type(); - switch (device_type) { -#ifdef MACE_ENABLE_HEXAGON - case HEXAGON: - hexagon_controller = make_unique(); - break; -#endif -#ifdef MACE_ENABLE_HTA - case HTA: - hexagon_controller = make_unique(device); - break; -#endif - default:LOG(FATAL) << "Not supported Hexagon device type: " << device_type; - } - - return hexagon_controller; -} + Device *device); } // namespace mace #endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_DEVICE_H_ diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc index 9d17ab44c18f406c86d1c23fcd60ca22041cb462..dedff7f908daf2afadca967b01517333b2314a75 100644 --- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc @@ -107,8 +107,6 @@ hexagon_nn_corner_type TransformCornerType(HexagonNNCornerType corner) { } // namespace HexagonDSPWrapper::HexagonDSPWrapper() { - hexnn_controller_init(); - std::string env_log_execute_time_str; GetEnv("MACE_DSP_LOG_EXECUTE_TIME", &env_log_execute_time_str); if (env_log_execute_time_str.empty()) { @@ -119,9 +117,7 @@ HexagonDSPWrapper::HexagonDSPWrapper() { } } -HexagonDSPWrapper::~HexagonDSPWrapper() { - hexnn_controller_deinit(); -} +HexagonDSPWrapper::~HexagonDSPWrapper() {} int HexagonDSPWrapper::GetVersion() { int version; @@ -258,10 +254,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def, } input_info_.emplace_back(input_info.name(), input_shape, - input_info.data_type(), - input_info.scale(), - input_info.zero_point(), - make_unique()); + input_info.data_type()); } // output info @@ -275,10 +268,7 @@ bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def, } output_info_.emplace_back(output_info.name(), output_shape, - output_info.data_type(), - output_info.scale(), - output_info.zero_point(), - make_unique()); + output_info.data_type()); VLOG(1) << "OutputInfo: " << "\n\t shape: " << output_shape[0] << " " << output_shape[1] << " " << output_shape[2] << " " << output_shape[3] diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h index 831d163f8ac02ac692f4edc3a3d0ff5c402a6bf6..2ec98e7ab6e590b8d44fbdb9c18e4da20cb66a8a 100644 --- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h @@ -56,7 +56,8 @@ class HexagonDSPWrapper : public HexagonControlWrapper { uint64_t GetLastExecuteCycles(); bool log_execute_time_; - + std::vector input_info_; + std::vector output_info_; MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper); }; } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_hta_transformer.cc b/mace/core/runtime/hexagon/hexagon_hta_transformer.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf0b753122bf1624dc7e37c59dfe090204d00d41 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_transformer.cc @@ -0,0 +1,644 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h" + +#include +#include +#include +#include + +#include "mace/core/op_context.h" +#include "mace/core/quantize.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/utils/math.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_helper.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace { +template +class QuantizeTransformer; + +template <> +class QuantizeTransformer : public BaseTransformer { + public: + void Init(Device *device) override { + device_ = device; + quantize_util_.Init(&device_->cpu_runtime()->thread_pool()); + } + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "Quantize on CPU"); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + output->SetScale(input->scale()); + output->SetZeroPoint(input->zero_point()); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + quantize_util_.QuantizeWithScaleAndZeropoint( + input_data, input->size(), input->scale(), input->zero_point(), + output_data); + return MaceStatus::MACE_SUCCESS; + } + + private: + QuantizeUtil quantize_util_; +}; + +#ifdef MACE_ENABLE_OPENCL +template <> +class QuantizeTransformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "Quantize on GPU"); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + output->SetScale(input->scale()); + output->SetZeroPoint(input->zero_point()); + const uint32_t gws = static_cast(RoundUpDiv4(output->size())); + OpenCLRuntime *runtime = device_->gpu_runtime()->opencl_runtime(); + if (kernel_.get() == nullptr) { + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("buffer_quantize"); + built_options.emplace("-Dbuffer_quantize=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype())); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, + built_options, &kernel_)); + } + + uint32_t idx = 0; + kernel_.setArg(idx++, gws); + kernel_.setArg(idx++, input->scale()); + kernel_.setArg(idx++, input->zero_point()); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel_.setArg(idx++, *(output->opencl_buffer())); + + const uint32_t lws = static_cast( + RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize())); + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws), cl::NDRange(lws), nullptr, + &event); + } else { + uint32_t roundup_gws = RoundUp(gws, lws); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws), cl::NDRange(lws), + nullptr, &event); + } + MACE_CL_RET_STATUS(error); + return MaceStatus::MACE_SUCCESS; + } + + private: + cl::Kernel kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +template +class DequantizeTransformer; + +template <> +class DequantizeTransformer : public BaseTransformer { + public: + void Init(Device *device) override { + device_ = device; + quantize_util_.Init(&device_->cpu_runtime()->thread_pool()); + } + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "Dequantize on CPU"); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + output->SetScale(input->scale()); + output->SetZeroPoint(input->zero_point()); + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + quantize_util_.Dequantize(input_data, input->size(), input->scale(), + input->zero_point(), output_data); + return MaceStatus::MACE_SUCCESS; + } + + private: + QuantizeUtil quantize_util_; +}; + +#ifdef MACE_ENABLE_OPENCL +template <> +class DequantizeTransformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "Dequantize on GPU"); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + output->SetScale(input->scale()); + output->SetZeroPoint(input->zero_point()); + const uint32_t gws = static_cast(RoundUpDiv4(output->size())); + OpenCLRuntime *runtime = device_->gpu_runtime()->opencl_runtime(); + if (kernel_.get() == nullptr) { + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("buffer_dequantize"); + built_options.emplace("-Dbuffer_dequantize=" + kernel_name); + built_options.emplace("-DIN_DATA_TYPE=" + DtToCLDt(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(output->dtype())); + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_transform", kernel_name, + built_options, &kernel_)); + } + + uint32_t idx = 0; + kernel_.setArg(idx++, gws); + kernel_.setArg(idx++, input->scale()); + kernel_.setArg(idx++, input->zero_point()); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel_.setArg(idx++, *(output->opencl_buffer())); + + const uint32_t lws = static_cast( + RoundUpDiv4(runtime->GetDeviceMaxWorkGroupSize())); + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws), cl::NDRange(lws), nullptr, + &event); + } else { + uint32_t roundup_gws = RoundUp(gws, lws); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(roundup_gws), cl::NDRange(lws), + nullptr, &event); + } + MACE_CL_RET_STATUS(error); + return MaceStatus::MACE_SUCCESS; + } + + private: + cl::Kernel kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +template +class NHWCToNCHW32Transformer; +template <> +class NHWCToNCHW32Transformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "NHWCToNCHW32Transformer on CPU"); + int32_t padding_value = input->zero_point(); + index_t batch = input->dim(0); + index_t height = input->dim(1); + index_t width = input->dim(2); + index_t channels = input->dim(3); + index_t height_stride = width * channels; + index_t batch_stride = height * width * channels; + + index_t output_width = RoundUp(width, 32); + index_t output_channel_stride = height * output_width; + index_t output_batch_stride = channels * height * output_width; + + output->Resize({batch, channels, height, output_width}); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const auto input_data = input->data(); + auto output_data = output->mutable_data(); + + device_->cpu_runtime()->thread_pool().Compute2D( + [=](index_t start0, index_t end0, index_t step0, index_t start1, + index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t c = start1; c < end1; c += step1) { + index_t input_offset = b * batch_stride + c; + index_t output_offset = + b * output_batch_stride + c * output_channel_stride; + for (index_t h = 0; h < height; ++h) { + for (index_t w = 0; w < width; ++w) { + output_data[output_offset + w] = + input_data[input_offset + w * channels]; + } + std::fill_n(output_data + output_offset + width, + output_width - width, padding_value); + input_offset += height_stride; + output_offset += output_width; + } + } + } + }, + 0, batch, 1, 0, channels, 1); + + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template <> +class NHWCToNCHW32Transformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "NHWCToNCHW32Transformer on GPU"); + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + const index_t output_width = RoundUp(width, 32); + std::vector transformed_shape = {batch, channels, height, + output_width}; + uint32_t gws[3]; + gws[0] = static_cast(RoundUpDiv4(output_width)); + gws[1] = static_cast(height); + gws[2] = static_cast(batch * channels); + MACE_RETURN_IF_ERROR(output->Resize(transformed_shape)); + + if (kernel_.get() == nullptr) { + std::set built_options; + std::string kernel_name = + MACE_OBFUSCATE_SYMBOL("transform_nhwc_to_nchw32"); + built_options.emplace("-Dtransform_nhwc_to_nchw32=" + kernel_name); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); + MACE_RETURN_IF_ERROR( + device_->gpu_runtime()->opencl_runtime()->BuildKernel( + "buffer_transform", kernel_name, built_options, &kernel_)); + } + uint32_t idx = 0; + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel_.setArg(idx++, input->zero_point()); + kernel_.setArg(idx++, *(output->opencl_buffer())); + kernel_.setArg(idx++, static_cast(batch)); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + + std::string tuning_key = Concat("transform_nhwc_to_nchw32", + transformed_shape[0], transformed_shape[1], + transformed_shape[2], transformed_shape[3]); + std::vector lws = {4, 4, 4, 0}; + MACE_RETURN_IF_ERROR( + TuningOrRun3DKernel(device_->gpu_runtime()->opencl_runtime(), kernel_, + tuning_key, gws, lws, nullptr)); + + return MaceStatus::MACE_SUCCESS; + } + + private: + cl::Kernel kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +template +class NHWCToD32Transformer; + +template <> +class NHWCToD32Transformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "NHWCToD32Transformer on CPU"); + index_t batch = input->dim(0); + index_t height = input->dim(1); + index_t width = input->dim(2); + index_t channels = input->dim(3); + index_t height_stride = width * channels; + index_t batch_stride = height * width * channels; + + index_t channel_slices = RoundUpDiv(channels, static_cast(32)); + index_t output_channel_slices_stride = width * 32; + index_t output_height_stride = channel_slices * width * 32; + index_t output_batch_stride = height * channel_slices * width * 32; + + std::vector output_shape{batch, height, channel_slices, width, 32}; + output->Resize(output_shape); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + std::fill_n(output_data, output->size(), input->zero_point()); + + device_->cpu_runtime()->thread_pool().Compute2D( + [=](index_t start0, index_t end0, index_t step0, index_t start1, + index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + index_t input_offset = b * batch_stride + h * height_stride; + index_t output_offset = + b * output_batch_stride + h * output_height_stride; + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < channels; ++c) { + output_data[output_offset + + c / 32 * output_channel_slices_stride + c % 32] = + input_data[input_offset + c]; + } + input_offset += channels; + output_offset += 32; + } + } + } + }, + 0, batch, 1, 0, height, 1); + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template <> +class NHWCToD32Transformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "D32ToNHWCTransformer on GPU"); + const index_t batch = input->dim(0); + const index_t height = input->dim(1); + const index_t width = input->dim(2); + const index_t channels = input->dim(3); + const index_t channel_slices = RoundUpDiv(channels, 32); + std::vector output_shape{batch, height, channel_slices, width, 32}; + output->Resize(output_shape); + + uint32_t gws[3]; + gws[0] = static_cast(RoundUpDiv4(width * 32)); + gws[1] = static_cast(channel_slices); + gws[2] = static_cast(batch * height); + + if (kernel_.get() == nullptr) { + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_nhwc_to_d32"); + built_options.emplace("-Dtransform_nhwc_to_d32=" + kernel_name); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); + MACE_RETURN_IF_ERROR( + device_->gpu_runtime()->opencl_runtime()->BuildKernel( + "buffer_transform", kernel_name, built_options, &kernel_)); + } + + uint32_t idx = 0; + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel_.setArg(idx++, input->zero_point()); + kernel_.setArg(idx++, *(output->opencl_buffer())); + kernel_.setArg(idx++, static_cast(batch)); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(channel_slices)); + + + std::string tuning_key = + Concat("transform_nhwc_to_d32", batch, height, width, channels); + std::vector lws = {4, 4, 4, 0}; + MACE_RETURN_IF_ERROR( + TuningOrRun3DKernel(device_->gpu_runtime()->opencl_runtime(), kernel_, + tuning_key, gws, lws, nullptr)); + return MaceStatus::MACE_SUCCESS; + } + + private: + cl::Kernel kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +template +class D32ToNHWCTransformer; + +template <> +class D32ToNHWCTransformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "D32ToNHWCTransformer on CPU"); + index_t batch = output->dim(0); + index_t height = output->dim(1); + index_t width = output->dim(2); + index_t channel = output->dim(3); + index_t height_stride = width * channel; + index_t batch_stride = height * width * channel; + + index_t channel_slices = RoundUpDiv(channel, static_cast(32)); + index_t input_channel_slices_stride = width * 32; + index_t input_height_stride = channel_slices * width * 32; + index_t input_batch_stride = height * channel_slices * width * 32; + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_data = input->data(); + auto output_data = output->mutable_data(); + + device_->cpu_runtime()->thread_pool().Compute2D( + [=](index_t start0, index_t end0, index_t step0, index_t start1, + index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + for (index_t h = start1; h < end1; h += step1) { + index_t input_offset = + b * input_batch_stride + h * input_height_stride; + index_t output_offset = b * batch_stride + h * height_stride; + for (index_t w = 0; w < width; ++w) { + for (index_t c = 0; c < channel; ++c) { + output_data[output_offset + c] = + input_data[input_offset + + c / 32 * input_channel_slices_stride + c % 32]; + } + input_offset += 32; + output_offset += channel; + } + } + } + }, + 0, batch, 1, 0, height, 1); + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template <> +class D32ToNHWCTransformer : public BaseTransformer { + public: + MaceStatus Compute(const Tensor *input, Tensor *output) override { + MACE_LATENCY_LOGGER(1, "D32ToNHWCTransformer on GPU"); + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t channel_slices = RoundUpDiv(channels, 32); + + uint32_t gws[3]; + gws[0] = static_cast(RoundUpDiv4(channels)); + gws[1] = static_cast(width); + gws[2] = static_cast(batch * height); + + if (kernel_.get() == nullptr) { + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("transform_d32_to_nhwc"); + built_options.emplace("-Dtransform_d32_to_nhwc=" + kernel_name); + std::string data_dt = DtToCLDt(input->dtype()); + built_options.emplace("-DIN_DATA_TYPE=" + data_dt); + built_options.emplace("-DDATA_TYPE=" + data_dt); + MACE_RETURN_IF_ERROR( + device_->gpu_runtime()->opencl_runtime()->BuildKernel( + "buffer_transform", kernel_name, built_options, &kernel_)); + } + + uint32_t idx = 0; + MACE_SET_3D_GWS_ARGS(kernel_, gws); + kernel_.setArg(idx++, *(input->opencl_buffer())); + MACE_CHECK(input->buffer_offset() % GetEnumTypeSize(input->dtype()) == 0, + "buffer offset not aligned"); + kernel_.setArg(idx++, + static_cast(input->buffer_offset() / + GetEnumTypeSize(input->dtype()))); + kernel_.setArg(idx++, *(output->opencl_buffer())); + kernel_.setArg(idx++, static_cast(batch)); + kernel_.setArg(idx++, static_cast(height)); + kernel_.setArg(idx++, static_cast(width)); + kernel_.setArg(idx++, static_cast(channels)); + kernel_.setArg(idx++, static_cast(channel_slices)); + + + std::string tuning_key = + Concat("transform_d32_to_nhwc", batch, height, width, channels); + std::vector lws = {4, 4, 4, 0}; + MACE_RETURN_IF_ERROR( + TuningOrRun3DKernel(device_->gpu_runtime()->opencl_runtime(), kernel_, + tuning_key, gws, lws, nullptr)); + return MaceStatus::MACE_SUCCESS; + } + + private: + cl::Kernel kernel_; +}; +#endif // MACE_ENABLE_OPENCL +} // namespace + +template +void HexagonHTATranformer::Init(Device *device) { + device_ = device; + quantizer_ = make_unique>(); + quantizer_->Init(device); + dequantizer_ = make_unique>(); + dequantizer_->Init(device); +} + +template +MaceStatus HexagonHTATranformer::SetInputTransformer( + hexagon_hta_hw_layout format) { + switch (format) { + case HEXAGON_HTA_HW_FORMAT_D32: + input_transformers_.push_back(make_unique>()); + break; + case HEXAGON_HTA_HW_FORMAT_PLANAR: + input_transformers_.push_back(make_unique>()); + break; + case HEXAGON_HTA_HW_FORMAT_DEPTH_FIRST: + default: + MACE_NOT_IMPLEMENTED; + break; + } + input_transformers_.back()->Init(device_); + return MaceStatus::MACE_SUCCESS; +} + +template +MaceStatus HexagonHTATranformer::SetOutputTransformer( + hexagon_hta_hw_layout format) { + switch (format) { + case HEXAGON_HTA_HW_FORMAT_D32: + output_transformers_.push_back(make_unique>()); + break; + case HEXAGON_HTA_HW_FORMAT_PLANAR: + case HEXAGON_HTA_HW_FORMAT_DEPTH_FIRST: + default: + MACE_NOT_IMPLEMENTED; + break; + } + output_transformers_.back()->Init(device_); + return MaceStatus::MACE_SUCCESS; +} + +template +MaceStatus HexagonHTATranformer::TransformInput(const Tensor *input, + Tensor *output, + int index) { + return input_transformers_[index]->Compute(input, output); +} + +template +MaceStatus HexagonHTATranformer::TransformOutput(const Tensor *input, + Tensor *output, + int index) { + return output_transformers_[index]->Compute(input, output); +} + +template +MaceStatus HexagonHTATranformer::Quantize(const Tensor *input, + Tensor *output) { + return quantizer_->Compute(input, output); +} + +template +MaceStatus HexagonHTATranformer::Dequantize(const Tensor *input, + Tensor *output) { + return dequantizer_->Compute(input, output); +} + +template void HexagonHTATranformer::Init(Device *device); +template MaceStatus HexagonHTATranformer::Quantize(const Tensor *input, + Tensor *output); +template MaceStatus HexagonHTATranformer::Dequantize(const Tensor *input, + Tensor *output); +template MaceStatus HexagonHTATranformer::SetInputTransformer( + hexagon_hta_hw_layout format); +template MaceStatus HexagonHTATranformer::SetOutputTransformer( + hexagon_hta_hw_layout format); +template MaceStatus HexagonHTATranformer::TransformInput( + const Tensor *input, Tensor *output, int index); +template MaceStatus HexagonHTATranformer::TransformOutput( + const Tensor *input, Tensor *output, int index); + +#ifdef MACE_ENABLE_OPENCL +template void HexagonHTATranformer::Init(Device *device); +template MaceStatus HexagonHTATranformer::Quantize(const Tensor *input, + Tensor *output); +template MaceStatus HexagonHTATranformer::Dequantize(const Tensor *input, + Tensor *output); +template MaceStatus HexagonHTATranformer::SetInputTransformer( + hexagon_hta_hw_layout format); +template MaceStatus HexagonHTATranformer::SetOutputTransformer( + hexagon_hta_hw_layout format); +template MaceStatus HexagonHTATranformer::TransformInput( + const Tensor *input, Tensor *output, int index); +template MaceStatus HexagonHTATranformer::TransformOutput( + const Tensor *input, Tensor *output, int index); +#endif // MACE_ENABLE_OPENCL +} // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_hta_transformer.h b/mace/core/runtime/hexagon/hexagon_hta_transformer.h new file mode 100644 index 0000000000000000000000000000000000000000..74369a4b3ff9634c82d4f0680e7180a6db6c4ec4 --- /dev/null +++ b/mace/core/runtime/hexagon/hexagon_hta_transformer.h @@ -0,0 +1,85 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_ +#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_ + +#include +#include + +#include "mace/core/device.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/utils/math.h" +#include "mace/utils/thread_pool.h" +#include "third_party/hta/hta_hexagon_api.h" + +namespace mace { +class BaseTransformer { + public: + BaseTransformer() = default; + virtual ~BaseTransformer() = default; + + virtual void Init(Device *device) { device_ = device; } + virtual MaceStatus Compute(const Tensor *input, Tensor *output) = 0; + + protected: + Device *device_; +}; + +class HexagonHTATranformerBase { + public: + HexagonHTATranformerBase() = default; + virtual ~HexagonHTATranformerBase() = default; + + virtual void Init(Device *device) = 0; + virtual MaceStatus SetInputTransformer( + const hexagon_hta_hw_layout format) = 0; + virtual MaceStatus SetOutputTransformer( + const hexagon_hta_hw_layout format) = 0; + virtual MaceStatus Quantize(const Tensor *input, Tensor *output) = 0; + virtual MaceStatus Dequantize(const Tensor *input, Tensor *output) = 0; + virtual MaceStatus TransformInput(const Tensor *input, + Tensor *output, + int index) = 0; + virtual MaceStatus TransformOutput(const Tensor *input, + Tensor *output, + int index) = 0; +}; + +template +class HexagonHTATranformer : public HexagonHTATranformerBase { + public: + void Init(Device *device) override; + MaceStatus SetInputTransformer(const hexagon_hta_hw_layout format) override; + MaceStatus SetOutputTransformer(const hexagon_hta_hw_layout format) override; + + MaceStatus Quantize(const Tensor *input, Tensor *output) override; + MaceStatus Dequantize(const Tensor *input, Tensor *output) override; + MaceStatus TransformInput(const Tensor *input, + Tensor *output, + int index) override; + MaceStatus TransformOutput(const Tensor *input, + Tensor *output, + int index) override; + + private: + Device *device_; + std::unique_ptr quantizer_; + std::unique_ptr dequantizer_; + std::vector> input_transformers_; + std::vector> output_transformers_; +}; +} // namespace mace +#endif // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_TRANSFORMER_H_ diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc index 069eab30c923053f1effdf1ee3e38ae5df0e2fe8..24a5eb82f015b8a66203d2071b1a1190b2aa4b4b 100644 --- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "mace/core/runtime/hexagon/hexagon_hta_wrapper.h" - +#include #include #include #include @@ -24,48 +24,59 @@ #include #include "mace/core/runtime/hexagon/hexagon_hta_ops.h" +#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h" #include "mace/core/types.h" #include "mace/utils/memory.h" -#include "mace/core/quantize.h" #include "third_party/hta/hta_hexagon_api.h" - namespace mace { namespace { -struct InputOutputMetadata { - void Init(float min_val, float max_val, int needs_quantization) { - this->min_val = min_val; - this->max_val = max_val; - this->needs_quantization = needs_quantization; +int GetHTAEnv(const std::string &name, int default_value) { + int value = default_value; + std::string env_str; + MaceStatus status = GetEnv(name.c_str(), &env_str); + if (status == MaceStatus::MACE_SUCCESS && !env_str.empty()) { + value = std::atoi(env_str.c_str()); } - float min_val; - float max_val; - int needs_quantization; -}; - -template -void AddInputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) { - tensor->batches = 1; - tensor->height = 1; - tensor->width = 1; - tensor->depth = 1; - tensor->data = const_cast( - reinterpret_cast(&data)); - tensor->dataLen = sizeof(data); - tensor->data_valid_len = sizeof(data); - tensor->unused = 0; + return value; } -template -void AddOutputMetadata(const T &data, hexagon_hta_nn_tensordef *tensor) { - tensor->data = const_cast( - reinterpret_cast(&data)); - tensor->dataLen = sizeof(data); +// Print the API logs to standard output. +void HtaApiLog(hexagon_hta_nn_nn_id id, + const char *api_op, + const char *const format, + ...) { + va_list arg; + va_start(arg, format); + if (api_op != NULL) { + printf("Graph ID: %d\t", id); + } + vfprintf(stdout, format, arg); + va_end(arg); +} +// Print the performance stats to standard output. +void HtaPerformanceLog(int log_level, + uint32_t network_handle, + uint32_t thread_id, + const char *const format, + ...) { + va_list arg; + va_start(arg, format); + printf("Log Level: %d, Network Handle: %d, Thread ID: %d - ", log_level, + network_handle, thread_id); + vfprintf(stdout, format, arg); + va_end(arg); } } // namespace HexagonHTAWrapper::HexagonHTAWrapper(Device *device) - : quantize_util_(&device->cpu_runtime()->thread_pool()) { + : allocator_(device->allocator()), +#ifdef MACE_ENABLE_OPENCL + transformer_(make_unique>()) { +#else + transformer_(make_unique>()) { +#endif + transformer_->Init(device); } int HexagonHTAWrapper::GetVersion() { @@ -84,6 +95,40 @@ bool HexagonHTAWrapper::Init() { LOG(INFO) << "Hexagon init"; MACE_CHECK(hexagon_hta_nn_init(&nn_id_) == 0, "hexagon_nn_init failed"); ResetPerfInfo(); + + int ret; + int power_level = GetHTAEnv("MACE_HTA_POWER_LEVEL", -1); + if (power_level != -1) { + ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_POWER_LEVEL, + &power_level, sizeof(power_level)); + LOG(INFO) << "HTA_NN_CONFIG_POWER_LEVEL: " << power_level + << " returns: " << ret; + } + + int is_compress = GetHTAEnv("MACE_HTA_BANDWIDTH_COMPRESSION", 1); + if (is_compress) { + ret = hexagon_hta_nn_set_config_params(nn_id_, + HTA_NN_CONFIG_BANDWIDTH_COMPRESSION, + &is_compress, sizeof(is_compress)); + LOG(INFO) << "HTA_NN_CONFIG_BANDWIDTH_COMPRESSION: " << is_compress + << " returns: " << ret; + } + + if (VLOG_IS_ON(2)) { + ret = hexagon_hta_nn_set_config_params( + nn_id_, HTA_NN_CONFIG_PERFORMANCE_LOG, + reinterpret_cast(&HtaPerformanceLog), + sizeof(&HtaPerformanceLog)); + MACE_CHECK(ret == 0, "HTA_NN_CONFIG_PERFORMANCE_LOG returns: " , ret); + } + + if (VLOG_IS_ON(3)) { + ret = hexagon_hta_nn_set_config_params(nn_id_, HTA_NN_CONFIG_API_LOG, + reinterpret_cast(&HtaApiLog), + sizeof(&HtaApiLog)); + MACE_CHECK(ret == 0, "HTA_NN_CONFIG_API_LOG returns: ", ret); + } + return true; } @@ -172,52 +217,107 @@ bool HexagonHTAWrapper::SetupGraph(const NetDef &net_def, outputs.size()); } + int64_t t1 = NowMicros(); + + MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed"); + + int64_t t2 = NowMicros(); + + VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1; + // input info num_inputs_ = net_def.input_info_size(); input_info_.reserve(num_inputs_); - for (const InputOutputInfo &input_info : net_def.input_info()) { + input_tensordef_.resize(num_inputs_); + for (int index = 0; index < num_inputs_; ++index) { + auto input_info = net_def.input_info(index); std::vector input_shape(input_info.dims().begin(), input_info.dims().end()); while (input_shape.size() < 4) { input_shape.insert(input_shape.begin(), 1); } - input_info_.emplace_back(input_info.name(), - input_shape, - input_info.data_type(), - input_info.scale(), - input_info.zero_point(), - make_unique()); + + auto quantized_tensor = make_unique(allocator_, DT_UINT8); + auto hta_tensor = make_unique(allocator_, DT_UINT8); + hexagon_hta_nn_hw_tensordef &input_tensordef = input_tensordef_[index]; + memset(&input_tensordef, 0, sizeof(input_tensordef)); + MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 0, index, + &input_tensordef) == 0); + input_tensordef.dataLen = input_tensordef.batchStride; + VLOG(1) << input_tensordef.format << " " << input_tensordef.elementSize + << " " << input_tensordef.numDims << " " + << input_tensordef.batchStride; + for (uint32_t i = 0; i < input_tensordef.numDims; ++i) { + VLOG(1) << input_tensordef.dim[i].length << " " + << input_tensordef.dim[i].lpadding << " " + << input_tensordef.dim[i].valid; + } + hta_tensor->Resize({input_tensordef.dataLen}); + MACE_CHECK(hta_tensor->raw_size() == input_tensordef.dataLen); + Tensor::MappingGuard input_guard(hta_tensor.get()); + input_tensordef.fd = + allocator_->rpcmem()->ToFd(hta_tensor->mutable_data()); + MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &input_tensordef) == 0); + + transformer_->SetInputTransformer(input_tensordef.format); + + input_info_.emplace_back( + input_info.name(), input_shape, input_info.data_type(), + input_info.scale(), input_info.zero_point(), + std::move(quantized_tensor), + std::move(hta_tensor)); } // output info num_outputs_ = net_def.output_info_size(); output_info_.reserve(num_outputs_); - for (const InputOutputInfo &output_info : net_def.output_info()) { + output_tensordef_.resize(num_outputs_); + for (int index = 0; index < num_outputs_; ++index) { + auto output_info = net_def.output_info(index); std::vector output_shape(output_info.dims().begin(), output_info.dims().end()); while (output_shape.size() < 4) { output_shape.insert(output_shape.begin(), 1); } - output_info_.emplace_back(output_info.name(), - output_shape, - output_info.data_type(), - output_info.scale(), - output_info.zero_point(), - make_unique()); + + auto quantized_tensor = make_unique(allocator_, DT_UINT8); + auto hta_tensor = make_unique(allocator_, DT_UINT8); + quantized_tensor->SetScale(output_info.scale()); + quantized_tensor->SetZeroPoint(output_info.zero_point()); + + hexagon_hta_nn_hw_tensordef &output_tensordef = output_tensordef_[index]; + memset(&output_tensordef, 0, sizeof(output_tensordef)); + MACE_CHECK(hexagon_hta_nn_get_memory_layout(nn_id_, 1, index, + &output_tensordef) == 0); + output_tensordef.dataLen = output_tensordef.batchStride; + VLOG(1) << output_tensordef.format << " " << output_tensordef.elementSize + << " " << output_tensordef.numDims << " " + << output_tensordef.batchStride; + for (uint32_t i = 0; i < output_tensordef.numDims; ++i) { + VLOG(1) << output_tensordef.dim[i].length << " " + << output_tensordef.dim[i].lpadding << " " + << output_tensordef.dim[i].valid; + } + hta_tensor->Resize({output_tensordef.batchStride}); + MACE_CHECK(hta_tensor->raw_size() == output_tensordef.dataLen); + Tensor::MappingGuard output_guard(hta_tensor.get()); + output_tensordef.fd = + allocator_->rpcmem()->ToFd(hta_tensor->mutable_data()); + MACE_CHECK(hexagon_hta_nn_register_tensor(nn_id_, &output_tensordef) == 0); + + transformer_->SetOutputTransformer(output_tensordef.format); + + output_info_.emplace_back( + output_info.name(), output_shape, output_info.data_type(), + output_info.scale(), output_info.zero_point(), + std::move(quantized_tensor), std::move(hta_tensor)); + VLOG(1) << "OutputInfo: " << "\n\t shape: " << output_shape[0] << " " << output_shape[1] << " " << output_shape[2] << " " << output_shape[3] << "\n\t type: " << output_info.data_type(); } - int64_t t1 = NowMicros(); - - MACE_CHECK(hexagon_hta_nn_prepare(nn_id_) == 0, "hexagon_nn_prepare failed"); - - int64_t t2 = NowMicros(); - - VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1; - return true; } @@ -266,78 +366,41 @@ bool HexagonHTAWrapper::ExecuteGraphNew( MACE_CHECK(num_outputs_ == static_cast(num_outputs), "Wrong outputs num"); - std::vector inputs(num_inputs * kNumMetaData); - std::vector outputs(num_outputs * kNumMetaData); - std::vector input_metadata(num_inputs); - std::vector output_metadata(num_outputs); - for (size_t i = 0; i < num_inputs; ++i) { const auto input_tensor = input_tensors.at(input_info_[i].name); - const auto &input_shape = input_tensor->shape(); - size_t index = i * kNumMetaData; - inputs[index].batches = static_cast(input_shape[0]); - inputs[index].height = static_cast(input_shape[1]); - inputs[index].width = static_cast(input_shape[2]); - inputs[index].depth = static_cast(input_shape[3]); - inputs[index].data = const_cast( - reinterpret_cast(input_tensor->raw_data())); - inputs[index].dataLen = static_cast(input_tensor->raw_size()); - inputs[index].data_valid_len = - static_cast(input_tensor->raw_size()); - inputs[index].unused = 0; - input_metadata[i].Init(.0f, .0f, 1); - AddInputMetadata(input_metadata[i].min_val, &inputs[index + 1]); - AddInputMetadata(input_metadata[i].max_val, &inputs[index + 2]); - AddInputMetadata(input_metadata[i].needs_quantization, &inputs[index + 3]); - } + input_tensor->SetScale(input_info_[i].scale); + input_tensor->SetZeroPoint(input_info_[i].zero_point); + MACE_CHECK_SUCCESS( + transformer_->Quantize(input_tensors.at(input_info_[i].name), + input_info_[i].quantized_tensor.get())); - // transform mace output to hexagon output - for (size_t i = 0; i < num_outputs; ++i) { - auto output_tensor = output_tensors->at(output_info_[i].name); - size_t index = i * kNumMetaData; - output_tensor->SetDtype(output_info_[i].data_type); - output_tensor->Resize(output_info_[i].shape); - - outputs[index].data = reinterpret_cast( - output_tensor->raw_mutable_data()); - outputs[index].dataLen = static_cast(output_tensor->raw_size()); - output_metadata[i].Init(.0f, .0f, 1); - - AddOutputMetadata(output_metadata[i].min_val, &outputs[index + 1]); - AddOutputMetadata(output_metadata[i].max_val, &outputs[index + 2]); - AddOutputMetadata(output_metadata[i].needs_quantization, - &outputs[index + 3]); + MACE_CHECK_SUCCESS(transformer_->TransformInput( + input_info_[i].quantized_tensor.get(), + input_info_[i].hta_tensor.get(), i)); + + Tensor::MappingGuard input_guard(input_info_[i].hta_tensor.get()); } - int res = hexagon_hta_nn_execute_new(nn_id_, - inputs.data(), - num_inputs * kNumMetaData, - outputs.data(), - num_outputs * kNumMetaData); - MACE_CHECK(res == 0, "execute error"); + MACE_CHECK(hexagon_hta_nn_execute_hw(nn_id_, + input_tensordef_.data(), num_inputs, + output_tensordef_.data(), num_outputs, + nullptr, nullptr) == 0); for (size_t i = 0; i < num_outputs; ++i) { - size_t index = i * kNumMetaData; - std::vector output_shape{ - outputs[index].batches, outputs[index].height, outputs[index].width, - outputs[index].depth}; - MACE_CHECK(output_shape.size() == output_info_[i].shape.size(), - output_shape.size(), " vs ", output_info_[i].shape.size(), - " wrong output shape inferred"); - for (size_t j = 0; j < output_shape.size(); ++j) { - MACE_CHECK(static_cast(output_shape[j]) - == output_info_[i].shape[j], - output_shape[j], " vs ", output_info_[i].shape[j], - " wrong output shape[", j, "] inferred"); + { // To sync cache + Tensor::MappingGuard output_guard(output_info_[i].hta_tensor.get()); } + output_info_[i].quantized_tensor->Resize(output_info_[i].shape); + transformer_->TransformOutput(output_info_[i].hta_tensor.get(), + output_info_[i].quantized_tensor.get(), i); + + auto output_tensor = output_tensors->at(output_info_[i].name); - MACE_CHECK(static_cast(outputs[index].data_valid_len) - == output_tensor->raw_size(), - outputs[index].data_valid_len, " vs ", output_tensor->raw_size(), - " wrong output bytes inferred."); + MaceStatus st = transformer_->Dequantize( + output_info_[i].quantized_tensor.get(), output_tensor); } - return res == 0; + return true; } } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h index 6b33514ccc8b1d628c856ca04c8b26f8d0af9006..aabcdc61c6aec244fcb1858d5c80f6b016b3e88b 100644 --- a/mace/core/runtime/hexagon/hexagon_hta_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_hta_wrapper.h @@ -16,18 +16,40 @@ #define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_HTA_WRAPPER_H_ #include +#include #include #include +#include -#include "mace/utils/thread_pool.h" -#include "mace/core/quantize.h" +#include "mace/core/device.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h" #include "mace/core/tensor.h" -#include "mace/core/device.h" #include "mace/public/mace.h" +#include "third_party/hta/hta_hexagon_api.h" namespace mace { +struct HTAInOutInfo : public InOutInfo { + HTAInOutInfo(const std::string &name, + const std::vector &shape, + const DataType data_type, + const float scale, + const int32_t zero_point, + std::unique_ptr quantized_tensor, + std::unique_ptr hta_tensor) + : InOutInfo(name, shape, data_type), + scale(scale), + zero_point(zero_point), + quantized_tensor(std::move(quantized_tensor)), + hta_tensor(std::move(hta_tensor)) {} + + float scale; + int32_t zero_point; + std::unique_ptr quantized_tensor; + std::unique_ptr hta_tensor; +}; + class HexagonHTAWrapper : public HexagonControlWrapper { public: explicit HexagonHTAWrapper(Device *device); @@ -50,7 +72,12 @@ class HexagonHTAWrapper : public HexagonControlWrapper { void SetDebugLevel(int level) override; private: - QuantizeUtil quantize_util_; + Allocator *allocator_; + std::vector input_info_; + std::vector output_info_; + std::vector input_tensordef_; + std::vector output_tensordef_; + std::unique_ptr transformer_; MACE_DISABLE_COPY_AND_ASSIGN(HexagonHTAWrapper); }; } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index b0bf041b6883000057a63608a82761abfa2037a3..a0865958893885c247ec2d5addef7d308c4ea3b5 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -16,9 +16,6 @@ #include "mace/core/runtime/opencl/opencl_allocator.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#ifdef MACE_ENABLE_RPCMEM -#include "third_party/rpcmem/rpcmem.h" -#endif // MACE_ENABLE_RPCMEM namespace mace { namespace { @@ -39,24 +36,10 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { } } -#ifdef MACE_ENABLE_RPCMEM -std::once_flag ion_prepared; -void PrepareQualcommION() { - rpcmem_init(); - std::atexit(rpcmem_deinit); -} -#endif // MACE_ENABLE_RPCMEM - } // namespace OpenCLAllocator::OpenCLAllocator( - OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) { -#ifdef MACE_ENABLE_RPCMEM - if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) { - std::call_once(ion_prepared, PrepareQualcommION); - } -#endif // MACE_ENABLE_RPCMEM -} + OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {} OpenCLAllocator::~OpenCLAllocator() {} @@ -168,7 +151,7 @@ void OpenCLAllocator::Delete(void *buffer) { if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) { auto it = cl_to_host_map_.find(buffer); MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL buffer not found!"); - rpcmem_free(it->second); + rpcmem_.Delete(it->second); cl_to_host_map_.erase(buffer); } #endif // MACE_ENABLE_RPCMEM @@ -184,7 +167,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) { if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) { auto it = cl_to_host_map_.find(buffer); MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL image not found!"); - rpcmem_free(it->second); + rpcmem_.Delete(it->second); cl_to_host_map_.erase(buffer); } #endif // MACE_ENABLE_RPCMEM @@ -194,7 +177,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) { void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes, - bool finish_cmd_queue) const { + bool finish_cmd_queue) { MACE_LATENCY_LOGGER(1, "Map OpenCL buffer"); void *mapped_ptr = nullptr; #ifdef MACE_ENABLE_RPCMEM @@ -209,7 +192,7 @@ void *OpenCLAllocator::Map(void *buffer, if (opencl_runtime_->qcom_host_cache_policy() == CL_MEM_HOST_WRITEBACK_QCOM) { - MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0); + MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0); } } else { #endif // MACE_ENABLE_RPCMEM @@ -234,7 +217,7 @@ void *OpenCLAllocator::Map(void *buffer, void *OpenCLAllocator::MapImage(void *buffer, const std::vector &image_shape, std::vector *mapped_image_pitch, - bool finish_cmd_queue) const { + bool finish_cmd_queue) { MACE_LATENCY_LOGGER(1, "Map OpenCL Image"); MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image"; void *mapped_ptr = nullptr; @@ -251,7 +234,7 @@ void *OpenCLAllocator::MapImage(void *buffer, if (opencl_runtime_->qcom_host_cache_policy() == CL_MEM_HOST_WRITEBACK_QCOM) { - MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0); + MACE_CHECK(rpcmem_.SyncCacheStart(mapped_ptr) == 0); } } else { #endif // MACE_ENABLE_RPCMEM @@ -275,13 +258,13 @@ void *OpenCLAllocator::MapImage(void *buffer, return mapped_ptr; } -void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { +void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) { MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image"); #ifdef MACE_ENABLE_RPCMEM if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) { if (opencl_runtime_->qcom_host_cache_policy() == CL_MEM_HOST_WRITEBACK_QCOM) { - MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_END) == 0); + MACE_CHECK(rpcmem_.SyncCacheEnd(mapped_ptr) == 0); } } else { #endif // MACE_ENABLE_RPCMEM @@ -301,17 +284,20 @@ void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { bool OpenCLAllocator::OnHost() const { return false; } #ifdef MACE_ENABLE_RPCMEM +Rpcmem *OpenCLAllocator::rpcmem() { + return &rpcmem_; +} + void OpenCLAllocator::CreateQualcommBufferIONHostPtr( const size_t nbytes, cl_mem_ion_host_ptr *ion_host) { - void *host = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED, - nbytes + opencl_runtime_->qcom_ext_mem_padding()); + void *host = rpcmem_.New(nbytes + opencl_runtime_->qcom_ext_mem_padding()); MACE_CHECK_NOTNULL(host); auto host_addr = reinterpret_cast(host); auto page_size = opencl_runtime_->qcom_page_size(); MACE_CHECK(host_addr % page_size == 0, "ION memory address: ", host_addr, " must be aligned to page size: ", page_size); - int fd = rpcmem_to_fd(host); + int fd = rpcmem_.ToFd(host); MACE_CHECK(fd >= 0, "Invalid rpcmem file descriptor: ", fd); ion_host->ext_host_ptr.allocation_type = CL_MEM_ION_HOST_PTR_QCOM; diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h index 0c2783a137ffc89aacc52e5d72a0a0a05a53d1d5..5411a93e710d08518879516f76b77fcfe1bdbe50 100644 --- a/mace/core/runtime/opencl/opencl_allocator.h +++ b/mace/core/runtime/opencl/opencl_allocator.h @@ -48,17 +48,21 @@ class OpenCLAllocator : public Allocator { void *Map(void *buffer, size_t offset, size_t nbytes, - bool finish_cmd_queue) const override; + bool finish_cmd_queue) override; void *MapImage(void *buffer, const std::vector &image_shape, std::vector *mapped_image_pitch, - bool finish_cmd_queue) const override; + bool finish_cmd_queue) override; - void Unmap(void *buffer, void *mapped_ptr) const override; + void Unmap(void *buffer, void *mapped_ptr) override; bool OnHost() const override; +#ifdef MACE_ENABLE_RPCMEM + Rpcmem *rpcmem() override; +#endif // MACE_ENABLE_RPCMEM + private: #ifdef MACE_ENABLE_RPCMEM void CreateQualcommBufferIONHostPtr(const size_t nbytes, @@ -69,6 +73,7 @@ class OpenCLAllocator : public Allocator { cl_mem_ion_host_ptr *ion_host); std::unordered_map cl_to_host_map_; + Rpcmem rpcmem_; #endif // MACE_ENABLE_RPCMEM OpenCLRuntime *opencl_runtime_; }; diff --git a/mace/ops/opencl/helper.cc b/mace/core/runtime/opencl/opencl_helper.cc similarity index 98% rename from mace/ops/opencl/helper.cc rename to mace/core/runtime/opencl/opencl_helper.cc index 16acafb5a244583c3d9b34df25755eb3d50284f7..a216b6215a22618e7a298ff4e915299b8f5ffb0e 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/core/runtime/opencl/opencl_helper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include #include @@ -22,7 +22,6 @@ #include "mace/utils/math.h" namespace mace { -namespace ops { std::vector FormatBufferShape( const std::vector &buffer_shape, @@ -59,8 +58,10 @@ std::string DtToCLDt(const DataType dt) { return "float"; case DT_HALF: return "half"; + case DT_UINT8: + return "uchar"; default: - LOG(FATAL) << "Unsupported data type"; + LOG(FATAL) << "Unsupported data type: " << dt; return ""; } } @@ -365,5 +366,4 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, return MaceStatus::MACE_SUCCESS; } -} // namespace ops } // namespace mace diff --git a/mace/ops/opencl/helper.h b/mace/core/runtime/opencl/opencl_helper.h similarity index 97% rename from mace/ops/opencl/helper.h rename to mace/core/runtime/opencl/opencl_helper.h index a9e9866c31e85bd82efb1d1b2622d429f8639c5a..4d10862a1da42d0b581e0e6a5e2e4d019425d0ca 100644 --- a/mace/ops/opencl/helper.h +++ b/mace/core/runtime/opencl/opencl_helper.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_OPENCL_HELPER_H_ -#define MACE_OPS_OPENCL_HELPER_H_ +#ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_ +#define MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_ #include #include @@ -30,7 +30,6 @@ #include "mace/utils/math.h" namespace mace { -namespace ops { // oorc for 'Out Of Range Check' #define MACE_OUT_OF_RANGE_DEFINITION \ std::shared_ptr oorc_flag; @@ -161,6 +160,5 @@ std::vector Default3DLocalWS(OpenCLRuntime *runtime, const uint32_t *gws, const uint32_t kwg_size); -} // namespace ops } // namespace mace -#endif // MACE_OPS_OPENCL_HELPER_H_ +#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_HELPER_H_ diff --git a/mace/libmace/BUILD.bazel b/mace/libmace/BUILD.bazel index 2d16fd94b17fbde485f0fa678a6fc74303e046c8..8b540b53d946df2751df3ba957d9bc0bdda2534a 100644 --- a/mace/libmace/BUILD.bazel +++ b/mace/libmace/BUILD.bazel @@ -21,6 +21,7 @@ load( "if_opencl_enabled", "if_openmp_enabled", "if_quantize_enabled", + "if_rpcmem_enabled", ) cc_library( @@ -47,6 +48,8 @@ cc_library( "-DMACE_ENABLE_HTA", ]) + if_apu_enabled([ "-DMACE_ENABLE_APU", + ]) + if_rpcmem_enabled([ + "-DMACE_ENABLE_RPCMEM", ]), deps = [ "//mace/ops", diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 3ab4b13ee1374b6db60c6d7fa2042051c4d1aeeb..b9d3b13c24f1490c688d775f51534c2094c6f377 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -520,9 +520,23 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config) #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) if (device_type_ == DeviceType::HEXAGON || device_type_ == DeviceType::HTA) { +#ifdef MACE_ENABLE_OPENCL + device_.reset(new HexagonDevice( + device_type_, thread_pool_.get(), + make_unique( + config.impl_->gpu_context()->opencl_tuner(), + config.impl_->gpu_context()->opencl_cache_storage(), + config.impl_->gpu_priority_hint(), + config.impl_->gpu_perf_hint(), + config.impl_->gpu_context()->opencl_binary_storage(), + config.impl_->num_threads(), + config.impl_->cpu_affinity_policy(), + thread_pool_.get()))); +#else device_.reset(new HexagonDevice(device_type_, thread_pool_.get())); +#endif // MACE_ENABLE_OPENCL } -#endif +#endif // defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) #ifdef MACE_ENABLE_APU if (device_type_ == DeviceType::APU) { device_.reset(new ApuDevice(thread_pool_.get())); @@ -579,15 +593,19 @@ MaceStatus MaceEngine::Impl::Init( << MakeString(MapKeys(output_info_map_)); } #if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) - DataType output_dt = output_info_map_[output_name].data_type(); - Tensor *output_tensor = - ws_->CreateTensor(output_name, device_->allocator(), output_dt); - output_tensor->set_data_format(DataFormat::NHWC); + if (device_type_ == HEXAGON || device_type_ == HTA) { + DataType output_dt = output_info_map_[output_name].data_type(); + Tensor *output_tensor = + ws_->CreateTensor(output_name, device_->allocator(), output_dt); + output_tensor->set_data_format(DataFormat::NHWC); + } #endif #if defined(MACE_ENABLE_APU) - Tensor *output_tensor = - ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT); - output_tensor->set_data_format(DataFormat::NHWC); + if (device_type_ == DeviceType::APU) { + Tensor *output_tensor = + ws_->CreateTensor(output_name, device_->allocator(), DT_FLOAT); + output_tensor->set_data_format(DataFormat::NHWC); + } #endif } #ifdef MACE_ENABLE_HEXAGON diff --git a/mace/ops/opencl/buffer/buffer_transform.h b/mace/ops/opencl/buffer/buffer_transform.h index c32ccbb13069ea800aa30b2ac8cd8a2eb6cac2b5..25415877e676707aab857fd09e81d4821ae99361 100644 --- a/mace/ops/opencl/buffer/buffer_transform.h +++ b/mace/ops/opencl/buffer/buffer_transform.h @@ -21,7 +21,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/buffer_type_transform.cc b/mace/ops/opencl/buffer/buffer_type_transform.cc index 2cb3ae0043df20ddfa25421572db5377f0c12363..688ded664fa7dac533fd7fbafcfc7d1d8fbf9cdc 100644 --- a/mace/ops/opencl/buffer/buffer_type_transform.cc +++ b/mace/ops/opencl/buffer/buffer_type_transform.cc @@ -15,7 +15,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/conv_2d.h b/mace/ops/opencl/buffer/conv_2d.h index c50752c3bc6abeaaabc961084d72e8f7afba9f76..563b835861ea76c6cb90b8ad27f2fa4c9d09e955 100644 --- a/mace/ops/opencl/buffer/conv_2d.h +++ b/mace/ops/opencl/buffer/conv_2d.h @@ -21,7 +21,7 @@ #include #include "mace/ops/opencl/buffer/utils.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/utils/memory.h" namespace mace { diff --git a/mace/ops/opencl/buffer/conv_2d_1x1.cc b/mace/ops/opencl/buffer/conv_2d_1x1.cc index 6eeb0f1d1584eb4eb14fd749602895437286e766..95c85b17dd24438a8c9bd45c974b7c23c46c85be 100644 --- a/mace/ops/opencl/buffer/conv_2d_1x1.cc +++ b/mace/ops/opencl/buffer/conv_2d_1x1.cc @@ -15,7 +15,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/conv_2d_general.cc b/mace/ops/opencl/buffer/conv_2d_general.cc index b19b702083bbdeb2f94b2d6ab8e7e13a02c3ab12..4c03ee2af0c5b5452878db16067fff114088884c 100644 --- a/mace/ops/opencl/buffer/conv_2d_general.cc +++ b/mace/ops/opencl/buffer/conv_2d_general.cc @@ -15,7 +15,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/depthwise_conv2d.h b/mace/ops/opencl/buffer/depthwise_conv2d.h index 98dffa12734b8404221869d147420a2e76866224..60d680777ba06af2aec2c04ff42dcad6a5bd5caa 100644 --- a/mace/ops/opencl/buffer/depthwise_conv2d.h +++ b/mace/ops/opencl/buffer/depthwise_conv2d.h @@ -21,7 +21,7 @@ #include #include "mace/ops/opencl/buffer/utils.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/utils/memory.h" namespace mace { diff --git a/mace/ops/opencl/buffer/pooling.h b/mace/ops/opencl/buffer/pooling.h index 9e675e29aa14bd12409f0a1315fe34c023a73b5d..952a8f5e4b7ef4ea743c64dba48575e77560502d 100644 --- a/mace/ops/opencl/buffer/pooling.h +++ b/mace/ops/opencl/buffer/pooling.h @@ -23,7 +23,7 @@ #include #include "mace/ops/opencl/buffer/utils.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/utils/memory.h" namespace mace { diff --git a/mace/ops/opencl/buffer/reshape.h b/mace/ops/opencl/buffer/reshape.h index f030f1e759b8fc7bf837d4b1054c062dcdac8338..bc572acfdf1bf42b18cd1f326b0d3d455ec9ad50 100644 --- a/mace/ops/opencl/buffer/reshape.h +++ b/mace/ops/opencl/buffer/reshape.h @@ -19,7 +19,7 @@ #include -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/softmax.h b/mace/ops/opencl/buffer/softmax.h index 05d27cac7f4bdcd408c6b25b958e6414bde8249a..0acae465953c75fc6d053b8d6c90040a17f75818 100644 --- a/mace/ops/opencl/buffer/softmax.h +++ b/mace/ops/opencl/buffer/softmax.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/buffer/utils.cc b/mace/ops/opencl/buffer/utils.cc index 536a7e0df8e4a875bb80c46a67613a246c9e221b..3e15115cc909465ccf1ddf3eed0501f70c0b4267 100644 --- a/mace/ops/opencl/buffer/utils.cc +++ b/mace/ops/opencl/buffer/utils.cc @@ -19,7 +19,7 @@ #include #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/cl/buffer_transform.cl b/mace/ops/opencl/cl/buffer_transform.cl index 0a7674d66f5fbc08b4443da288f26ac5ff5c1d47..9e554dcfe41b9503a0b6cf649c5f190710a480fe 100644 --- a/mace/ops/opencl/cl/buffer_transform.cl +++ b/mace/ops/opencl/cl/buffer_transform.cl @@ -209,3 +209,202 @@ __kernel void transform_data_type(BUFFER_OUT_OF_RANGE_PARAMS DATA_TYPE4 input_value = CONVERT4(vload4(out_idx, input + input_offset)); vstore4(input_value, out_idx, output); } + +__kernel void buffer_quantize(BUFFER_OUT_OF_RANGE_PARAMS + __private const int global_size_dim0, + __private const float scale, + __private const int zero_point, + __global float *input, + __private const int input_offset, + __global uchar *output) { + const int out_idx = get_global_id(0); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_idx >= global_size_dim0) { + return; + } +#endif + + uchar4 output_value = + convert_uchar4_sat_rte(vload4(out_idx, input) / scale + zero_point); + vstore4(output_value, out_idx, output); +} + +__kernel void buffer_dequantize(BUFFER_OUT_OF_RANGE_PARAMS + __private const int global_size_dim0, + __private const float scale, + __private const int zero_point, + __global uchar *input, + __private const int input_offset, + __global float *output) { + const int out_idx = get_global_id(0); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_idx >= global_size_dim0) { + return; + } +#endif + float4 output_value = + convert_float4(convert_int4(vload4(out_idx, input)) - zero_point) * scale; + vstore4(output_value, out_idx, output); +} + + +// NHWC -> NCHW (W roundup to 32) +__kernel void transform_nhwc_to_nchw32(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __global uchar *input, // NHWC + __private const int input_offset, + __private const int zero_point, + __global uchar *output, + __private const int batch, + __private const int height, + __private const int width, + __private const int channels) { + const int width_blk_idx = get_global_id(0); + const int h_idx = get_global_id(1); + const int bc_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (width_blk_idx >= global_size_dim0 || + h_idx >= global_size_dim1 || + bc_idx >= global_size_dim2) { + return; + } +#endif + const int b_idx = bc_idx / channels; + const int chan_idx = bc_idx - mul24(b_idx, channels); + const int w_idx = width_blk_idx << 2; + const int padded_width = global_size_dim0 << 2; + + const int in_offset = mad24(mad24(mad24(b_idx, height, h_idx), + width, w_idx), channels, chan_idx) + input_offset; + const int out_offset = (mad24(mad24(mad24(b_idx, channels, chan_idx), + height, h_idx), padded_width, w_idx)); + + uchar4 value = zero_point; + if (w_idx + 3 < width) { + value.x = input[in_offset]; + value.y = input[in_offset + channels]; + value.z = input[in_offset + 2 * channels]; + value.w = input[in_offset + 3 * channels]; + } else if (w_idx < width) { + const int diff = width - w_idx; + switch(diff) { + case 3: + value.z = input[in_offset + 2 * channels]; + case 2: + value.y = input[in_offset + channels]; + case 1: + value.x = input[in_offset]; + } + } + VSTORE4(value, output, out_offset); +} + + +// N H ceil(C/32) W 32 -> NHWC +__kernel void transform_d32_to_nhwc(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __global uchar *input, // D32 + __private const int input_offset, + __global uchar *output, // NHWC + __private const int batch, + __private const int height, + __private const int width, + __private const int channels, + __private const int channel_slices) { + const int chan_blk_idx = get_global_id(0); + const int w_idx = get_global_id(1); + const int bh_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (chan_blk_idx >= global_size_dim0 || + w_idx >= global_size_dim1 || + bh_idx >= global_size_dim2) { + return; + } +#endif + const int b_idx = bh_idx / height; + const int h_idx = bh_idx - mul24(b_idx, height); + const int c_idx = chan_blk_idx << 2; + const int c_slice = c_idx >> 5; + const int c_slice_idx = c_idx & 31; + + const int in_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx), + channel_slices, c_slice), width, w_idx), 32, c_slice_idx) + input_offset; + const int out_offset = (mad24(mad24(mad24(b_idx, height, h_idx), + width, w_idx), channels, c_idx)); + + uchar4 value = vload4(0, input + in_offset); + if (c_idx + 3 < channels) { + VSTORE4(value, output, out_offset); + } else { + const int diff = channels - c_idx; + switch(diff) { + case 3: + vstore3(value.xyz, 0, output + out_offset); + break; + case 2: + vstore2(value.xy, 0, output + out_offset); + break; + case 1: + output[out_offset] = value.x; + break; + } + } +} + +// NHWC -> N H ceil(C/32) W 32 +__kernel void transform_nhwc_to_d32(BUFFER_OUT_OF_RANGE_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM3 + __global uchar *input, // NHWC + __private const int input_offset, + __private const int zero_point, + __global uchar *output, // D32 + __private const int batch, + __private const int height, + __private const int width, + __private const int channels, + __private const int channel_slices) { + const int w_32_idx = get_global_id(0); + const int c_slice = get_global_id(1); + const int bh_idx = get_global_id(2); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w_32_idx >= global_size_dim0 || + c_slice >= global_size_dim1 || + bh_idx >= global_size_dim2) { + return; + } +#endif + const int b_idx = bh_idx / height; + const int h_idx = bh_idx - mul24(b_idx, height); + const int w_idx = w_32_idx >> 3; + const int c_slice_blk_idx = w_32_idx & 7; + const int c_slice_idx = c_slice_blk_idx << 2; + const int c_idx = (c_slice << 5) + c_slice_idx; + + const int in_offset = (mad24(mad24(mad24(b_idx, height, h_idx), + width, w_idx), channels, c_idx)) + input_offset; + const int out_offset = mad24(mad24(mad24(mad24(b_idx, height, h_idx), + channel_slices, c_slice), width, w_idx), 32, c_slice_idx); + + uchar4 value = zero_point; + if (c_idx + 3 < channels) { + value = vload4(0, input + in_offset); + } else if (c_idx < channels) { + value = vload4(0, input + in_offset); + const int diff = channels - c_idx; + switch(diff) { + case 3: + value.w = zero_point; break; + case 2: + value.zw = zero_point; break; + case 1: + value.yzw = zero_point; break; + } + } // else value = zero_point + + VSTORE4(value, output, out_offset); +} diff --git a/mace/ops/opencl/image/activation.h b/mace/ops/opencl/image/activation.h index e98b5e9daefe0cf988b6cb39ee7e0cf4903ea89b..929d267ddd2860161c45eb63b3be465e870298ed 100644 --- a/mace/ops/opencl/image/activation.h +++ b/mace/ops/opencl/image/activation.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/addn.h b/mace/ops/opencl/image/addn.h index b163152bf15838c385b38690c75f8f92499b5ae2..575dee22764af5e856ec19792f5fce60634f906b 100644 --- a/mace/ops/opencl/image/addn.h +++ b/mace/ops/opencl/image/addn.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/batch_norm.h b/mace/ops/opencl/image/batch_norm.h index b2201a96631fef8ddd3b1a1748550aa96897e646..6b7773682ff546753b75f2f94f0fb2282a0b39fc 100644 --- a/mace/ops/opencl/image/batch_norm.h +++ b/mace/ops/opencl/image/batch_norm.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/batch_to_space.h b/mace/ops/opencl/image/batch_to_space.h index a0aced7c021cdee7dfe55b3800e9da324e7abf59..a9d047aa2a7af096b535f2086afe9450beed46c3 100644 --- a/mace/ops/opencl/image/batch_to_space.h +++ b/mace/ops/opencl/image/batch_to_space.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/bias_add.h b/mace/ops/opencl/image/bias_add.h index 7c25662da81b183ca88588dc756b724b50ed33ac..67644d6a1f58b99fc0c3d4d2d4021c1e2e178adb 100644 --- a/mace/ops/opencl/image/bias_add.h +++ b/mace/ops/opencl/image/bias_add.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/buffer_to_image.h b/mace/ops/opencl/image/buffer_to_image.h index 493f6579db7ced93681ad2b8b80b491edd934b8d..3389118279f3cdf7c8050e1bb5fd17c9e154530d 100644 --- a/mace/ops/opencl/image/buffer_to_image.h +++ b/mace/ops/opencl/image/buffer_to_image.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/channel_shuffle.h b/mace/ops/opencl/image/channel_shuffle.h index 371ecf22a6cf61e3e7c60b8af4abe981f3a1264e..94448d805ccb86887bb1b9e12bce0cfba66db4a4 100644 --- a/mace/ops/opencl/image/channel_shuffle.h +++ b/mace/ops/opencl/image/channel_shuffle.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/concat.h b/mace/ops/opencl/image/concat.h index f1e51fd96e7312d30419eafada40796f000c55c2..e5cd297779f7adb583653e31d25aa5816a377d4f 100644 --- a/mace/ops/opencl/image/concat.h +++ b/mace/ops/opencl/image/concat.h @@ -21,7 +21,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h index a1ee3301b373d43980d38b1fd38fb7876c5c47d2..6044c1a7235535cc0f67dcdc716b25189ed7a3d4 100644 --- a/mace/ops/opencl/image/conv_2d.h +++ b/mace/ops/opencl/image/conv_2d.h @@ -21,7 +21,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc index 718240152f6a74e0835c75edbae7782a1fb8c23f..494672a4447cf0ed9e8611e11a241f9cc1387816 100644 --- a/mace/ops/opencl/image/conv_2d_1x1.cc +++ b/mace/ops/opencl/image/conv_2d_1x1.cc @@ -15,7 +15,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc index d8a8b9cfbba611e5d0a320e8708f7f08c0a2b844..8bfc988c8ebf4057b9a2942f632594d14cfcf7d0 100644 --- a/mace/ops/opencl/image/conv_2d_3x3.cc +++ b/mace/ops/opencl/image/conv_2d_3x3.cc @@ -15,7 +15,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/utils/math.h" namespace mace { diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc index bf4baea72da0e0776eb3223e71650b475c994906..9964c5f25cba4b39e401ac39764bd6d29b6f62f1 100644 --- a/mace/ops/opencl/image/conv_2d_general.cc +++ b/mace/ops/opencl/image/conv_2d_general.cc @@ -14,7 +14,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/common/activation_type.h" #include "mace/utils/math.h" diff --git a/mace/ops/opencl/image/crop.h b/mace/ops/opencl/image/crop.h index c2f1c53aa2383ab89669be2520a9af3c1f2a27c8..33a5d2603e56e297b0c0271ad806009b38550a07 100644 --- a/mace/ops/opencl/image/crop.h +++ b/mace/ops/opencl/image/crop.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/deconv_2d.h b/mace/ops/opencl/image/deconv_2d.h index aa3b9d249b58cd3982866c71ef07b30ee24c75bc..4f1db7e66fa4580690bd648c259543dce292083d 100644 --- a/mace/ops/opencl/image/deconv_2d.h +++ b/mace/ops/opencl/image/deconv_2d.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/depth_to_space.h b/mace/ops/opencl/image/depth_to_space.h index ac68cbdb02adffad9b7bfc911363b1c95d2a7a86..383a4c6f3fed98c2b4cec5b36121004a78a0109f 100644 --- a/mace/ops/opencl/image/depth_to_space.h +++ b/mace/ops/opencl/image/depth_to_space.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/depthwise_conv2d.h b/mace/ops/opencl/image/depthwise_conv2d.h index f4bc4f2a0c92adc2edca4c5eb820c2f00f63d680..c72170acdb1c15ebf27dbd327d64b5b73d40de2e 100644 --- a/mace/ops/opencl/image/depthwise_conv2d.h +++ b/mace/ops/opencl/image/depthwise_conv2d.h @@ -21,7 +21,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/depthwise_deconv2d.h b/mace/ops/opencl/image/depthwise_deconv2d.h index 2055511678d8340da655b298c2a8a163279c95a3..fe039cb679c449f0d432b86531d17795cb3e83e6 100644 --- a/mace/ops/opencl/image/depthwise_deconv2d.h +++ b/mace/ops/opencl/image/depthwise_deconv2d.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/eltwise.h b/mace/ops/opencl/image/eltwise.h index 5678f9c72ffbe2ec706e2b44bd73457f938cb585..a9298cc6582e4e5f8d805c1a0d00f9f65e99de0b 100644 --- a/mace/ops/opencl/image/eltwise.h +++ b/mace/ops/opencl/image/eltwise.h @@ -25,7 +25,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/eltwise_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/fully_connected.h b/mace/ops/opencl/image/fully_connected.h index 9f1bae647f33c5906f31f312c5a094d64ef322e6..010edcac9979c659e6d926e076d941d9fea426dd 100644 --- a/mace/ops/opencl/image/fully_connected.h +++ b/mace/ops/opencl/image/fully_connected.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/activation_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/image_to_buffer.h b/mace/ops/opencl/image/image_to_buffer.h index 85893f6b283da5b659d30466568c65c52d931954..5d5c524884c0ccb6ce976ee8cd45d345c445e20d 100644 --- a/mace/ops/opencl/image/image_to_buffer.h +++ b/mace/ops/opencl/image/image_to_buffer.h @@ -21,7 +21,7 @@ #include "mace/core/op_context.h" #include "mace/ops/opencl/buffer_transform_kernel.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/lpnorm.h b/mace/ops/opencl/image/lpnorm.h index ae517c28f018b595e78e56b6a59ea939e23276df..cac641125d99d4e93495d67b45e00f0f27bb3c7c 100644 --- a/mace/ops/opencl/image/lpnorm.h +++ b/mace/ops/opencl/image/lpnorm.h @@ -17,7 +17,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/opencl/lpnorm.h" namespace mace { diff --git a/mace/ops/opencl/image/lstm_cell.h b/mace/ops/opencl/image/lstm_cell.h index 006374f9d099df01f866231f1756c97ec4b16190..998d8147675c2dd1b3ade1b782055a86117aea83 100644 --- a/mace/ops/opencl/image/lstm_cell.h +++ b/mace/ops/opencl/image/lstm_cell.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/matmul.h b/mace/ops/opencl/image/matmul.h index afd4792cba2eb3a33ccbf88959481ebb0cb3f225..8ee05239b798d8c8b6f660fa4aea335ded3549b7 100644 --- a/mace/ops/opencl/image/matmul.h +++ b/mace/ops/opencl/image/matmul.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/mvnorm.h b/mace/ops/opencl/image/mvnorm.h index 9ff6d47f4547227dc54c22e4709b1ca26ff4d3b9..f6e609d27240612a0c53141ce409790b6b826234 100644 --- a/mace/ops/opencl/image/mvnorm.h +++ b/mace/ops/opencl/image/mvnorm.h @@ -20,7 +20,7 @@ #include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/opencl/mvnorm.h" namespace mace { diff --git a/mace/ops/opencl/image/pad.h b/mace/ops/opencl/image/pad.h index f4b8278bbbfc4f67e1e16622baac4517ac441fb6..3df88f34e31020a848ee34d9c958cf8bc0200b32 100644 --- a/mace/ops/opencl/image/pad.h +++ b/mace/ops/opencl/image/pad.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/ops/common/pad_type.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/pooling.h b/mace/ops/opencl/image/pooling.h index 8d709368c8f9d2154cbd60eb07c1a9742fc2f506..5c0e14a52b544e65af82bfd05bcc2a939e9d2a1b 100644 --- a/mace/ops/opencl/image/pooling.h +++ b/mace/ops/opencl/image/pooling.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/reduce.h b/mace/ops/opencl/image/reduce.h index 992ac1b1491c1ccfeba27ad39b743ab568354797..0dfb48b427a25df89e475e45873d0ec69197f95a 100644 --- a/mace/ops/opencl/image/reduce.h +++ b/mace/ops/opencl/image/reduce.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/common/reduce_type.h" namespace mace { diff --git a/mace/ops/opencl/image/reshape.h b/mace/ops/opencl/image/reshape.h index 4004fb5e904c4105f3ae5615e8dde37c557a62e1..60be5fe0272c8827ce95003613ba0e07ab025396 100644 --- a/mace/ops/opencl/image/reshape.h +++ b/mace/ops/opencl/image/reshape.h @@ -21,7 +21,7 @@ #include #include "mace/core/operator.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/ops/opencl/buffer_transform_kernel.h" namespace mace { diff --git a/mace/ops/opencl/image/resize_bicubic.h b/mace/ops/opencl/image/resize_bicubic.h index cb215f19aa6a22fb3f919b2048b85e084c35667e..5abc553974e0c3fb1a4c2056ec140baf70e736cd 100644 --- a/mace/ops/opencl/image/resize_bicubic.h +++ b/mace/ops/opencl/image/resize_bicubic.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/resize_bilinear.h b/mace/ops/opencl/image/resize_bilinear.h index 68b1478dc81d620cc2bde198b02c221913b7939f..ca3602d33942da03de3aa3f3cb093513af74a324 100644 --- a/mace/ops/opencl/image/resize_bilinear.h +++ b/mace/ops/opencl/image/resize_bilinear.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/resize_nearest_neighbor.h b/mace/ops/opencl/image/resize_nearest_neighbor.h index 9e2cec61a822e4e86e139e6bfe299771a94794d6..8bb10d4b2fd56046a689beae3e9abb3f0671f05e 100644 --- a/mace/ops/opencl/image/resize_nearest_neighbor.h +++ b/mace/ops/opencl/image/resize_nearest_neighbor.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/softmax.h b/mace/ops/opencl/image/softmax.h index 505dff57c9a7caf718a4a7f98ab3d6ffe58a5565..525f1edc51ee8bc7637a2c9c83ffa876d67ab4b2 100644 --- a/mace/ops/opencl/image/softmax.h +++ b/mace/ops/opencl/image/softmax.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/space_to_batch.h b/mace/ops/opencl/image/space_to_batch.h index 6ad5d22833e2ff2104c974bd77f6da5c76af1ad3..20777dc88453bc1746aab4e50c2c20f98babecec 100644 --- a/mace/ops/opencl/image/space_to_batch.h +++ b/mace/ops/opencl/image/space_to_batch.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/space_to_depth.h b/mace/ops/opencl/image/space_to_depth.h index 324977ea45c518a4a7a46520f0b5626c82716ea2..661e09af222ebf8ae07082d4192878d8e4703f36 100644 --- a/mace/ops/opencl/image/space_to_depth.h +++ b/mace/ops/opencl/image/space_to_depth.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/split.h b/mace/ops/opencl/image/split.h index 956ff6573a60ed2050d5b526f58734cdc8fdff43..20e1936207dca72126efba0a1b80a3bafa149012 100644 --- a/mace/ops/opencl/image/split.h +++ b/mace/ops/opencl/image/split.h @@ -24,7 +24,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index bd2d1e7f394e7ed98eb8bd4e948da32e615349be..5acddb2556946b42ad1062ce6ec8c7bcf255e2cf 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -23,7 +23,7 @@ #include "mace/core/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" namespace mace { namespace ops { diff --git a/mace/ops/opencl/image/winograd_conv2d.cc b/mace/ops/opencl/image/winograd_conv2d.cc index 1ea2634a022e7614bcc600e3e34827e7a4aa8338..fd7cdfe67f1f37b4f1701d77d28f0759829594dc 100644 --- a/mace/ops/opencl/image/winograd_conv2d.cc +++ b/mace/ops/opencl/image/winograd_conv2d.cc @@ -16,7 +16,7 @@ #include "mace/core/op_context.h" #include "mace/ops/common/activation_type.h" #include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/utils/memory.h" #include "mace/utils/math.h" diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 55f2931795893a3181e86828c19046380d7c991d..6d025026d728825c8c55dd44b60832840d8879c9 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -175,9 +175,9 @@ bool RunModel(const std::string &model_name, if (status != MaceStatus::MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } -#ifdef MACE_ENABLE_OPENCL +#if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA) std::shared_ptr gpu_context; - if (device_type == DeviceType::GPU) { + if (device_type == DeviceType::GPU || device_type == DeviceType::HTA) { const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH"); const std::string storage_path = std::string(storage_path_ptr == nullptr ? diff --git a/test/ccunit/BUILD.bazel b/test/ccunit/BUILD.bazel index d3039c0067793111783cda70398f8ac7a35f094d..ce4d268cd3ba86b2badb605a4b21d1d10422afbc 100644 --- a/test/ccunit/BUILD.bazel +++ b/test/ccunit/BUILD.bazel @@ -9,6 +9,7 @@ load( "if_android", "if_android_armv7", "if_hexagon_enabled", + "if_hta_enabled", "if_neon_enabled", "if_opencl_enabled", "if_openmp_enabled", @@ -41,7 +42,9 @@ cc_test( [ "mace/ops/opencl/*.cc", ] - )), + )) + if_hta_enabled([ + "mace/core/runtime/hexagon/hta_transform_test.cc", + ]), copts = [ "-Werror", "-Wextra", @@ -59,6 +62,8 @@ cc_test( "-DMACE_ENABLE_QUANTIZE", ]) + if_hexagon_enabled([ "-DMACE_ENABLE_HEXAGON", + ]) + if_hta_enabled([ + "-DMACE_ENABLE_HTA", ]), linkopts = if_openmp_enabled([ "-fopenmp", diff --git a/test/ccunit/CMakeLists.txt b/test/ccunit/CMakeLists.txt index 21107815e9b604e4a0eb616db4d9c20c80186c6c..ae83e1a3aee54e0bb0bc9998607e38f1a22887f6 100644 --- a/test/ccunit/CMakeLists.txt +++ b/test/ccunit/CMakeLists.txt @@ -7,6 +7,10 @@ file(GLOB MACE_CC_TEST_SRCS mace/libmace/*.cc ) +if(MACE_ENABLE_HTA) + set(MACE_CC_TEST_SRCS ${MACE_CC_TEST_SRCS} mace/core/runtime/hexagon/hta_transform_test.cc) +endif(MACE_ENABLE_HTA) + add_executable(mace_cc_test ${MACE_CC_TEST_SRCS}) target_link_libraries(mace_cc_test PUBLIC mace_cc_test_utils diff --git a/test/ccunit/mace/core/runtime/hexagon/hta_transform_test.cc b/test/ccunit/mace/core/runtime/hexagon/hta_transform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d123c8bd1e9955b58a4dd761e9705846e7d0af9 --- /dev/null +++ b/test/ccunit/mace/core/runtime/hexagon/hta_transform_test.cc @@ -0,0 +1,152 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/quantize.h" +#include "mace/core/runtime/hexagon/hexagon_hta_transformer.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +class HTATransformTest : public OpsTestBase {}; + +namespace { +template +void TestHTAQuantizeDequantize(const std::vector &input) { + float min_val, max_val; + FindMinMax(input.data(), input.size(), &min_val, &max_val); + float scale; + int32_t zero; + AdjustRange(min_val, max_val, false, &scale, &zero); + + OpsTestNet net; + Device *device = OpTestContext::Get()->GetDevice(D); + + net.AddInputFromArray("Input", + {static_cast(input.size())}, + input); + Tensor *input_tensor = net.GetOutput("Input"); + input_tensor->SetScale(scale); + input_tensor->SetZeroPoint(zero); + Tensor *quantized_output = net.ws()->CreateTensor( + "QuantizedOutput", device->allocator(), DT_UINT8); + Tensor *dequantized_output = net.ws()->CreateTensor( + "DequantizedOutput", device->allocator(), DT_FLOAT); + + mace::HexagonHTATranformer transformer; + transformer.Init(device); + transformer.Quantize(input_tensor, quantized_output); + transformer.Dequantize(quantized_output, dequantized_output); + + ExpectTensorNear(*input_tensor, + *dequantized_output, + 0.1); +} + +} // namespace + +TEST_F(HTATransformTest, TestHTAQuantize) { + TestHTAQuantizeDequantize({-2, -1, 0, 1, 2, 3, 4}); + TestHTAQuantizeDequantize({-2, -1, 0, 1, 2, 3, 4}); +} + +namespace { +void TestHTAInputTransform(const std::vector &input_shape, + const hexagon_hta_hw_layout format) { + OpsTestNet net; + Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); + net.AddRandomInput("Input", input_shape); + Tensor *input_tensor = net.GetOutput("Input"); + input_tensor->SetScale(0.1); + input_tensor->SetZeroPoint(1); + Tensor *cpu_transformed_tensor = net.ws()->CreateTensor( + "CpuTransformedOutput", device->allocator(), DT_UINT8); + Tensor *gpu_transformed_tensor = net.ws()->CreateTensor( + "GpuTransformedOutput", device->allocator(), DT_UINT8); + + mace::HexagonHTATranformer cpu_transformer; + mace::HexagonHTATranformer gpu_transformer; + cpu_transformer.Init(device); + gpu_transformer.Init(device); + cpu_transformer.SetInputTransformer(format); + gpu_transformer.SetInputTransformer(format); + cpu_transformer.TransformInput(input_tensor, cpu_transformed_tensor, 0); + gpu_transformer.TransformInput(input_tensor, gpu_transformed_tensor, 0); + + net.Sync(); + ExpectTensorNear(*cpu_transformed_tensor, *gpu_transformed_tensor); +} + +} // namespace + +TEST_F(HTATransformTest, TestHTAInputTransform) { + TestHTAInputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_PLANAR); + TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR); + TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_PLANAR); + TestHTAInputTransform({1, 19, 31, 3}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAInputTransform({1, 15, 33, 27}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAInputTransform({1, 15, 33, 35}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAInputTransform({1, 224, 224, 3}, HEXAGON_HTA_HW_FORMAT_D32); +} + + +namespace { +void TestHTAOutputTransform(const std::vector &output_shape, + const hexagon_hta_hw_layout format) { + index_t batch = output_shape[0]; + index_t height = output_shape[1]; + index_t width = output_shape[2]; + index_t channels = output_shape[3]; + MACE_CHECK(format == HEXAGON_HTA_HW_FORMAT_D32); + std::vector input_shape { + batch, height, RoundUpDiv(channels, 32), width, 32}; + + OpsTestNet net; + Device *device = OpTestContext::Get()->GetDevice(DeviceType::GPU); + net.AddRandomInput("Input", input_shape); + Tensor *input_tensor = net.GetOutput("Input"); + Tensor *cpu_transformed_tensor = net.ws()->CreateTensor( + "CpuTransformedOutput", device->allocator(), DT_UINT8); + Tensor *gpu_transformed_tensor = net.ws()->CreateTensor( + "GpuTransformedOutput", device->allocator(), DT_UINT8); + cpu_transformed_tensor->Resize(output_shape); + gpu_transformed_tensor->Resize(output_shape); + + mace::HexagonHTATranformer cpu_transformer; + mace::HexagonHTATranformer gpu_transformer; + cpu_transformer.Init(device); + gpu_transformer.Init(device); + cpu_transformer.SetOutputTransformer(format); + gpu_transformer.SetOutputTransformer(format); + cpu_transformer.TransformOutput(input_tensor, cpu_transformed_tensor, 0); + gpu_transformer.TransformOutput(input_tensor, gpu_transformed_tensor, 0); + + net.Sync(); + ExpectTensorNear(*cpu_transformed_tensor, *gpu_transformed_tensor); +} + +} // namespace + +TEST_F(HTATransformTest, TestHTAOutputTransform) { + TestHTAOutputTransform({1, 15, 33, 2}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAOutputTransform({1, 19, 31, 27}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAOutputTransform({1, 19, 31, 35}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAOutputTransform({1, 224, 224, 2}, HEXAGON_HTA_HW_FORMAT_D32); + TestHTAOutputTransform({1, 384, 384, 3}, HEXAGON_HTA_HW_FORMAT_D32); +} + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc index 5ee423d335b14eeda11c2148222e0e9fd854e925..3dfe468a8db889418c48a15776e79adccadf9319 100644 --- a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc +++ b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc @@ -21,7 +21,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" -#include "mace/ops/opencl/helper.h" +#include "mace/core/runtime/opencl/opencl_helper.h" #include "mace/utils/memory.h" namespace mace { diff --git a/test/ccunit/mace/ops/quantize_test.cc b/test/ccunit/mace/ops/quantize_test.cc index ecfe5f5d07b638154fabb481348f4164b505fe5f..b6a86841717614ca2287909c296ad0573198cc0e 100644 --- a/test/ccunit/mace/ops/quantize_test.cc +++ b/test/ccunit/mace/ops/quantize_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/core/quantize.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/test/ccutils/mace/ops/testing/test_utils.h b/test/ccutils/mace/ops/testing/test_utils.h index 6a0a045b6326a67689f9755bc911a2f54fbc798a..ef830781d7fd597599180d8882c47eda1800a3e8 100644 --- a/test/ccutils/mace/ops/testing/test_utils.h +++ b/test/ccutils/mace/ops/testing/test_utils.h @@ -250,8 +250,24 @@ struct Expector { Tensor::MappingGuard y_mapper(&y); auto a = x.data(); auto b = y.data(); - for (int i = 0; i < x.size(); ++i) { - ExpectEqual(a[i], b[i]); + if (x.dim_size() == 4) { + for (int n = 0; n < x.dim(0); ++n) { + for (int h = 0; h < x.dim(1); ++h) { + for (int w = 0; w < x.dim(2); ++w) { + for (int c = 0; c < x.dim(3); ++c) { + EXPECT_EQ(*a, *b) << "with index = [" << n << ", " << h << ", " + << w << ", " << c << "]"; + a++; + b++; + } + } + } + } + } else { + for (int i = 0; i < x.size(); ++i) { + EXPECT_EQ(a[i], b[i]) + << "a = " << a << " b = " << b << " index = " << i; + } } } diff --git a/third_party/nnlib/arm64-v8a/libhexagon_controller.so b/third_party/nnlib/arm64-v8a/libhexagon_controller.so index 5b17a007b440e9e068925ca3f347dc8d94ccb55a..5e2e85a1341ba040885086e45d8b03e85ae5c6d6 100755 Binary files a/third_party/nnlib/arm64-v8a/libhexagon_controller.so and b/third_party/nnlib/arm64-v8a/libhexagon_controller.so differ diff --git a/third_party/nnlib/armeabi-v7a/libhexagon_controller.so b/third_party/nnlib/armeabi-v7a/libhexagon_controller.so index fc3e89b99f531afbfddba918c5ee4c8ffddb05ac..1ac9ae9d00ed1e3f8ee0acb04bb006a1d625bbcc 100755 Binary files a/third_party/nnlib/armeabi-v7a/libhexagon_controller.so and b/third_party/nnlib/armeabi-v7a/libhexagon_controller.so differ diff --git a/third_party/nnlib/hexnn_dsp_controller.h b/third_party/nnlib/hexnn_dsp_controller.h index 671a6c68a13889a7179acad5f709f49dad68e292..8818217d220729238af7d0b18bd1d6c06d8bfe6d 100644 --- a/third_party/nnlib/hexnn_dsp_controller.h +++ b/third_party/nnlib/hexnn_dsp_controller.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved. + * Copyright (c) 2019, The Linux Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted (subject to the limitations in the @@ -42,10 +42,6 @@ extern "C" { int hexnn_controller_request_unsigned_pd(); -int hexnn_controller_init(); - -int hexnn_controller_deinit(); - #ifdef __cplusplus } #endif diff --git a/third_party/rpcmem/arm64-v8a/rpcmem.a b/third_party/rpcmem/arm64-v8a/rpcmem.a index 982714b75645f2a408f7fde000376cf32d18055b..9bc7dea567e21127a9eede04904b678c881d2b7f 100644 Binary files a/third_party/rpcmem/arm64-v8a/rpcmem.a and b/third_party/rpcmem/arm64-v8a/rpcmem.a differ diff --git a/third_party/rpcmem/armeabi-v7a/rpcmem.a b/third_party/rpcmem/armeabi-v7a/rpcmem.a index faa1baa5f8e7a689e66ac126df0446c27d08e061..85f991e897b53977cfd02ce9fa12edb3953bfece 100644 Binary files a/third_party/rpcmem/armeabi-v7a/rpcmem.a and b/third_party/rpcmem/armeabi-v7a/rpcmem.a differ diff --git a/third_party/rpcmem/rpcmem.h b/third_party/rpcmem/rpcmem.h index f0296a2212d8fc57e64dbcee1c6cde3b5b0e821c..ac1889e02db582e702fe119a3be4d0e166416e46 100755 --- a/third_party/rpcmem/rpcmem.h +++ b/third_party/rpcmem/rpcmem.h @@ -6,6 +6,8 @@ #ifndef RPCMEM_H #define RPCMEM_H +#include + /** * RPCMEM_DEFAULT_HEAP * Dynamicaly select the heap to use. This should be ok for most usecases. @@ -77,16 +79,25 @@ extern "C" { #endif +typedef struct rpcmem rpcmem; +struct rpcmem { + void *lst; // QList* + void *mt; // pthread_mutex_t* + int ionfd; + int flag; + int ionversion; +}; + /** * call once to initialize the library - * NOTE: rpcmem_init is not thread safe + * NOTE: rpcmem_init is now thread safe */ -void rpcmem_init(void); +void rpcmem_init(rpcmem *rm); /** * call once for cleanup - * NOTE: rpcmem_deinit is not thread safe + * NOTE: rpcmem_deinit is now thread safe */ -void rpcmem_deinit(void); +void rpcmem_deinit(rpcmem *rm); /** * Allocate via ION a buffer of size @@ -96,10 +107,10 @@ void rpcmem_deinit(void); * @retval, 0 on failure, pointer to buffer on success * * For example: - * buf = rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size); + * buf = rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size); */ -void* rpcmem_alloc(int heapid, unsigned int flags, int size); +void* rpcmem_alloc(rpcmem *rm, int heapid, uint32_t flags, int size); /** * allocate with default settings @@ -107,24 +118,24 @@ void* rpcmem_alloc(int heapid, unsigned int flags, int size); #if !defined(WINNT) && !defined (_WIN32_WINNT) __attribute__((unused)) #endif -static __inline void* rpcmem_alloc_def(int size) { - return rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size); +static __inline void* rpcmem_alloc_def(rpcmem *rm, int size) { + return rpcmem_alloc(rm, RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size); } /** * free buffer, ignores invalid buffers */ -void rpcmem_free(void* po); +void rpcmem_free(rpcmem *rm, void* po); /** * returns associated fd */ -int rpcmem_to_fd(void* po); +int rpcmem_to_fd(rpcmem *rm, void* po); /** * cache coherency management */ -int rpcmem_sync_cache(void* po, unsigned int flags); +int rpcmem_sync_cache(rpcmem *rm, void* po, uint32_t flags); #ifdef __cplusplus } diff --git a/tools/bazel-build-standalone-lib.sh b/tools/bazel-build-standalone-lib.sh index 8a07811373c996793e7652fab21379086c146945..f01dd06510d19f747f44303a5f949276962ba0a1 100755 --- a/tools/bazel-build-standalone-lib.sh +++ b/tools/bazel-build-standalone-lib.sh @@ -21,6 +21,11 @@ mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_dsp mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_apu +if [[ "$BUILD_HTA" == "1" ]]; then + mkdir -p $LIB_DIR/armeabi-v7a/cpu_gpu_hta + mkdir -p $LIB_DIR/arm64-v8a/cpu_gpu_hta +fi + rm -rf $LIB_DIR/linux-x86-64 mkdir -p $LIB_DIR/linux-x86-64 @@ -33,6 +38,18 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu # build shared libraries +if [[ "$BUILD_HTA" == "1" ]]; then + echo "build shared lib for armeabi-v7a + cpu_gpu_hta" + bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true + cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/ + cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/ + + echo "build shared lib for arm64-v8a + cpu_gpu_hta" + bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true + cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_hta/ + cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/ +fi + echo "build shared lib for armeabi-v7a + cpu_gpu_dsp" bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/ @@ -71,6 +88,18 @@ if [[ "$OSTYPE" != "darwin"* ]];then fi # build static libraries +if [[ "$BUILD_HTA" == "1" ]]; then + echo "build static lib for armeabi-v7a + cpu_gpu_hta" + bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true + cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_hta/ + cp third_party/hta/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_hta/ + + echo "build static lib for arm64-v8a + cpu_gpu_hta" + bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hta=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true + cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_hta/ + cp third_party/hta/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_hta/ +fi + echo "build static lib for armeabi-v7a + cpu_gpu_dsp" bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/ diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py index 1679f604d85cac55b47865a22fd87f021931a227..a0bed2496ad02066a14c7bbb4f89474c726c9a04 100644 --- a/tools/bazel_adb_run.py +++ b/tools/bazel_adb_run.py @@ -100,6 +100,11 @@ def parse_args(): type=str2bool, default=True, help="Whether to use rpcmem") + parser.add_argument( + "--enable_hta", + type=str2bool, + default=False, + help="Whether to use hta") parser.add_argument( '--address_sanitizer', action="store_true", @@ -170,6 +175,7 @@ def main(unused_args): enable_neon=FLAGS.enable_neon, enable_quantize=FLAGS.enable_quantize, enable_rpcmem=FLAGS.enable_rpcmem, + enable_hta=FLAGS.enable_hta, address_sanitizer=FLAGS.address_sanitizer, debug_mode=FLAGS.debug_mode) if FLAGS.run_target: diff --git a/tools/converter.py b/tools/converter.py index 324907943c9076d4291be201eb77847d84013564..dd9a6cbc9218eef8fa771210b9e2ae0eb2ebbbc5 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -218,7 +218,8 @@ def get_opencl_mode(configs): YAMLKeyword.runtime, "") runtime_list.append(model_runtime.lower()) - if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list: + if RuntimeType.gpu in runtime_list or RuntimeType.cpu_gpu in runtime_list \ + or RuntimeType.hta in runtime_list: return True return False diff --git a/tools/python/transform/hexagon_converter.py b/tools/python/transform/hexagon_converter.py index 3a99c5cfa1a23c4be1a567897de26071859bb936..6d6d8eaf27cc65015291e19f6bb55d0c3b02ea5e 100644 --- a/tools/python/transform/hexagon_converter.py +++ b/tools/python/transform/hexagon_converter.py @@ -295,6 +295,20 @@ class HexagonConverter(base_converter.ConverterInterface): else: index += 1 + if self._option.device == DeviceType.HTA.value: + # replace QuantizeINPUT_f_to_8 with INPUT + quantize_input_op.type = HexagonOp.INPUT.name + del quantize_input_op.output_shape[1:] + del quantize_input_op.output_type[1:] + del quantize_input_op.out_max_byte_size[1:] + + # replace first op's input min max with constant + self.add_constant_min_max_for_first_op(self._model.op[1]) + + # replace DequantizeOUTPUT_8tof with OUTPUT + dequantize_output_op.type = HexagonOp.OUTPUT.name + del dequantize_output_op.input[1:] + return quantize_input_op.output def add_node_id(self, model_inputs): diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 831d015d1b3613ce5088ff91adb5aab36f3bae55..219b8502fac4dd44f18886c2727b1633abd907ab 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -605,18 +605,20 @@ def create_internal_storage_dir(serialno, phone_data_dir): def push_depended_so_libs(libmace_dynamic_library_path, abi, phone_data_dir, serialno): - dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends", - libmace_dynamic_library_path) - src_file = "" - for dep in split_stdout(dep_so_libs): - if dep == "libgnustl_shared.so": - src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \ - "%s/libgnustl_shared.so" \ - % (os.environ["ANDROID_NDK_HOME"], abi) - elif dep == "libc++_shared.so": - src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \ - "%s/libc++_shared.so"\ - % (os.environ["ANDROID_NDK_HOME"], abi) + src_file = "%s/sources/cxx-stl/llvm-libc++/libs/" \ + "%s/libc++_shared.so" \ + % (os.environ["ANDROID_NDK_HOME"], abi) + try: + dep_so_libs = sh.bash(os.environ["ANDROID_NDK_HOME"] + "/ndk-depends", + libmace_dynamic_library_path) + except sh.ErrorReturnCode_127: + print("Find no ndk-depends, use default libc++_shared.so") + else: + for dep in split_stdout(dep_so_libs): + if dep == "libgnustl_shared.so": + src_file = "%s/sources/cxx-stl/gnu-libstdc++/4.9/libs/" \ + "%s/libgnustl_shared.so" \ + % (os.environ["ANDROID_NDK_HOME"], abi) print("push %s to %s" % (src_file, phone_data_dir)) adb_push(src_file, phone_data_dir, serialno)