From f763da2b859200b5e6a3d9238cadfc6ea652db83 Mon Sep 17 00:00:00 2001 From: liuqi Date: Tue, 17 Jul 2018 12:43:21 +0800 Subject: [PATCH] Remove unused tensor of GPU. --- mace/core/runtime/opencl/opencl_runtime.cc | 6 + mace/core/runtime/opencl/opencl_runtime.h | 1 + mace/core/runtime_failure_mock.cc | 8 +- mace/core/tensor.h | 10 ++ mace/core/workspace.cc | 121 +++++++++++++++------ mace/core/workspace.h | 2 + mace/kernels/opencl/buffer_to_image.cc | 3 + mace/libmace/mace.cc | 5 +- mace/public/mace_runtime.h | 6 +- 9 files changed, 125 insertions(+), 37 deletions(-) diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 10d3088c..3e67ef52 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -675,6 +675,12 @@ uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() { return size; } +uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() { + uint64_t size = 0; + device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); + return size; +} + uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) { uint64_t size = 0; kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size); diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 31d29328..931df6c4 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -85,6 +85,7 @@ class OpenCLRuntime { void GetCallStats(const cl::Event &event, CallStats *stats); uint64_t GetDeviceMaxWorkGroupSize(); + uint64_t GetDeviceMaxMemAllocSize(); uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); uint64_t GetKernelWaveSize(const cl::Kernel &kernel); bool IsNonUniformWorkgroupsSupported() const; diff --git a/mace/core/runtime_failure_mock.cc b/mace/core/runtime_failure_mock.cc index b28f51fe..3b47855e 100644 --- a/mace/core/runtime_failure_mock.cc +++ b/mace/core/runtime_failure_mock.cc @@ -12,7 +12,7 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -#include <ctime> +#include <random> #include <cstdlib> #include "mace/core/runtime_failure_mock.h" @@ -35,10 +35,12 @@ inline float GetRuntimeFailureRatioFromEnv() { } // namespace bool ShouldMockRuntimeFailure() { - static unsigned int seed = time(NULL); static float mock_runtime_failure_ratio = GetRuntimeFailureRatioFromEnv(); if (mock_runtime_failure_ratio > 1e-6) { - float random_ratio = rand_r(&seed) / static_cast<float>(RAND_MAX); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(0.0, 1.0); + float random_ratio = dis(gen); if (random_ratio < mock_runtime_failure_ratio) { VLOG(0) << "Mock runtime failure."; return true; diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 6dd41a4a..a40e55c6 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -105,18 +105,21 @@ class Tensor { dtype_(type), buffer_(nullptr), is_buffer_owner_(true), + unused_(false), name_("") {} Tensor(BufferBase *buffer, DataType dtype) : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), + unused_(false), name_("") {} Tensor(const BufferSlice &buffer_slice, DataType dtype) : dtype_(dtype), buffer_slice_(buffer_slice), is_buffer_owner_(false), + unused_(false), name_("") { buffer_ = &buffer_slice_; } @@ -133,6 +136,8 @@ class Tensor { inline void SetDtype(DataType dtype) { dtype_ = dtype; } + inline bool unused() const { return unused_; } + inline const std::vector<index_t> &shape() const { return shape_; } inline index_t dim_size() const { return shape_.size(); } @@ -195,6 +200,10 @@ class Tensor { return static_cast<T *>(buffer_->raw_mutable_data()); } + inline void MarkUnused() { + unused_ = true; + } + inline void Clear() { MACE_CHECK_NOTNULL(buffer_); buffer_->Clear(raw_size()); @@ -362,6 +371,7 @@ class Tensor { BufferBase *buffer_; BufferSlice buffer_slice_; bool is_buffer_owner_; + bool unused_; std::string name_; MACE_DISABLE_COPY_AND_ASSIGN(Tensor); diff --git
a/mace/core/workspace.cc b/mace/core/workspace.cc index 3d03345d..cb6be4ec 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -18,6 +18,9 @@ #include <vector> #include "mace/core/arg_helper.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/core/runtime/opencl/opencl_runtime.h" +#endif #include "mace/core/workspace.h" #include "mace/utils/timer.h" @@ -85,41 +88,82 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def, VLOG(3) << "Model data size: " << model_data_size; if (model_data_size > 0) { - if (type == DeviceType::CPU) { - tensor_buffer_ = std::unique_ptr<BufferBase>( - new Buffer(GetDeviceAllocator(type), - const_cast<unsigned char *>(model_data), - model_data_size)); - } else { - tensor_buffer_ = std::unique_ptr<BufferBase>( - new Buffer(GetDeviceAllocator(type))); - MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); - tensor_buffer_->Map(nullptr); - tensor_buffer_->Copy(const_cast<unsigned char *>(model_data), - 0, model_data_size); - tensor_buffer_->UnMap(); - } - } +#ifdef MACE_ENABLE_OPENCL + if (type == DeviceType::GPU && + OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <= + static_cast<uint64_t>(model_data_size)) { + for (auto &const_tensor : net_def.tensors()) { + MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); + VLOG(3) << "Tensor name: " << const_tensor.name() + << ", data type: " << const_tensor.data_type() << ", shape: " + << MakeString(std::vector<int>(const_tensor.dims().begin(), + const_tensor.dims().end())); + std::vector<index_t> dims; + for (const index_t d : const_tensor.dims()) { + dims.push_back(d); + } - for (auto &const_tensor : net_def.tensors()) { - MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); - VLOG(3) << "Tensor name: " << const_tensor.name() - << ", data type: " << const_tensor.data_type() << ", shape: " - << MakeString(std::vector<int>(const_tensor.dims().begin(), - const_tensor.dims().end())); - std::vector<index_t> dims; - for (const index_t d : const_tensor.dims()) { - dims.push_back(d); - } + std::unique_ptr<Tensor> tensor( + new Tensor(GetDeviceAllocator(type), const_tensor.data_type())); + tensor->Resize(dims); - std::unique_ptr<Tensor> tensor( - new Tensor(BufferSlice(tensor_buffer_.get(), const_tensor.offset(), - const_tensor.data_size() * - GetEnumTypeSize(const_tensor.data_type())), - const_tensor.data_type())); + MACE_CHECK(tensor->size() == const_tensor.data_size(), + "Tensor's data_size not equal with the shape"); + MACE_CHECK(const_tensor.offset() + tensor->raw_size() <= + model_data_size, + "buffer offset + length (", + const_tensor.offset(), + " + ", + tensor->raw_size(), + ") should <= ", + model_data_size); + tensor->CopyBytes(model_data + const_tensor.offset(), + const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type())); - tensor->Reshape(dims); - tensor_map_[const_tensor.name()] = std::move(tensor); + tensor_map_[const_tensor.name()] = std::move(tensor); + } + } else { +#else + { +#endif + if (type == DeviceType::CPU) { + tensor_buffer_ = std::unique_ptr<BufferBase>( + new Buffer(GetDeviceAllocator(type), + const_cast<unsigned char *>(model_data), + model_data_size)); + } else { + tensor_buffer_ = std::unique_ptr<BufferBase>( + new Buffer(GetDeviceAllocator(type))); + MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size)); + tensor_buffer_->Map(nullptr); + tensor_buffer_->Copy(const_cast<unsigned char *>(model_data), + 0, model_data_size); + tensor_buffer_->UnMap(); + } + for (auto &const_tensor : net_def.tensors()) { + MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); + VLOG(3) << "Tensor name: " << const_tensor.name() + << ", data type: " << const_tensor.data_type() << ", shape: " + << MakeString(std::vector<int>(const_tensor.dims().begin(), + const_tensor.dims().end())); + std::vector<index_t> dims; + for (const index_t d : const_tensor.dims()) { + dims.push_back(d); + } + + std::unique_ptr<Tensor> tensor( + new Tensor(BufferSlice( + tensor_buffer_.get(), const_tensor.offset(), + const_tensor.data_size() * + GetEnumTypeSize(const_tensor.data_type())), + const_tensor.data_type())); + + tensor->Reshape(dims); + tensor_map_[const_tensor.name()] = std::move(tensor); + } + } } if (type == DeviceType::CPU || type == DeviceType::GPU) { @@ -232,4 +276,17 @@ ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { } } +void Workspace::RemoveUnusedBuffer() { + auto iter = tensor_map_.begin(); + auto end_iter = tensor_map_.end(); + while (iter != end_iter) { + auto old_iter = iter++; + if (old_iter->second->unused()) { + tensor_map_.erase(old_iter); + } + } + + tensor_buffer_.reset(nullptr); +} + } // namespace mace diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 38e8777b..ec636128 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -53,6 +53,8 @@ class Workspace { ScratchBuffer *GetScratchBuffer(DeviceType device_type); + void RemoveUnusedBuffer(); + private: MaceStatus CreateOutputTensorBuffer(const NetDef &net_def, DeviceType device_type); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 5efad285..b29f7e81 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -180,6 +180,9 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()( }; } + // Mark the buffer unused. 
+ const_cast<Tensor *>(buffer)->MarkUnused(); + return MACE_SUCCESS; } diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 93518f85..03731078 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -222,6 +222,9 @@ MaceStatus MaceEngine::Impl::Init( #ifdef MACE_ENABLE_HEXAGON } #endif + if (device_type_ == DeviceType::GPU) { + ws_->RemoveUnusedBuffer(); + } return MaceStatus::MACE_SUCCESS; } @@ -240,7 +243,7 @@ MaceStatus MaceEngine::Impl::Init( } model_data_ = LoadModelData(model_data_file, model_data_size_); - Init(net_def, input_nodes, output_nodes, model_data_); + MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_)); if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON) { UnloadModelData(model_data_, model_data_size_); diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 807155bb..a6a628b6 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -66,6 +66,7 @@ class KVStorageFactory { class __attribute__((visibility("default"))) FileStorageFactory : public KVStorageFactory { public: + // You have to make sure your APP has read and write permissions for the path. explicit FileStorageFactory(const std::string &path); ~FileStorageFactory(); @@ -77,7 +78,10 @@ class __attribute__((visibility("default"))) FileStorageFactory std::unique_ptr<Impl> impl_; }; -// Set KV store factory used as OpenCL cache. (Call Once) +// Set Key-Value store factory. (Call Once) +// Now KVStorage is used to store the built OpenCL binaries to file, +// which could speed up the GPU initialization and first run. +// If you do not call this API, GPU initialization may be slow. __attribute__((visibility("default"))) void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory); -- GitLab