Commit f763da2b authored by liuqi

Remove unused tensor of GPU.

Parent 691331bd
@@ -675,6 +675,12 @@ uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
   return size;
 }
 
+uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
+  uint64_t size = 0;
+  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size);
+  return size;
+}
+
 uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
   kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
......
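Review note: CL_DEVICE_MAX_MEM_ALLOC_SIZE is the largest single buffer the OpenCL device will hand out, so the new accessor lets callers test whether a block of data fits in one allocation. A minimal sketch of such a guard, assuming only the new accessor (the helper name and parameter are illustrative, not part of this commit):

// Sketch: decide between one big buffer and per-tensor buffers.
// `required_bytes` is illustrative, not part of this commit.
inline bool FitsInSingleAllocation(uint64_t required_bytes) {
  return required_bytes <
         OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize();
}

The workspace.cc hunk below applies exactly this check to the packed model data.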
@@ -85,6 +85,7 @@ class OpenCLRuntime {
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
+  uint64_t GetDeviceMaxMemAllocSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   bool IsNonUniformWorkgroupsSupported() const;
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <cstdlib>
+#include <random>
 #include <string>
 
 #include "mace/core/runtime_failure_mock.h"
@@ -35,10 +35,12 @@ inline float GetRuntimeFailureRatioFromEnv() {
 }  // namespace
 
 bool ShouldMockRuntimeFailure() {
-  static unsigned int seed = time(NULL);
   static float mock_runtime_failure_ratio = GetRuntimeFailureRatioFromEnv();
   if (mock_runtime_failure_ratio > 1e-6) {
-    float random_ratio = rand_r(&seed) / static_cast<float>(RAND_MAX);
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0.0, 1.0);
+    float random_ratio = dis(gen);
     if (random_ratio < mock_runtime_failure_ratio) {
       VLOG(0) << "Mock runtime failure.";
       return true;
......
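The switch from rand_r to <random> removes the function-level static seed, which was shared mutable state across callers. Note the new code constructs a std::random_device and std::mt19937 on every call; if that per-call seeding cost ever matters, a thread-local generator seeded once is a common variant, sketched here as a hypothetical refinement rather than part of this commit:

// Hypothetical variant: seed once per thread, then reuse the generator.
static float RandomRatio() {
  static thread_local std::mt19937 gen{std::random_device{}()};
  std::uniform_real_distribution<float> dis(0.0f, 1.0f);
  return dis(gen);
}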
@@ -105,18 +105,21 @@ class Tensor {
         dtype_(type),
         buffer_(nullptr),
         is_buffer_owner_(true),
+        unused_(false),
         name_("") {}
 
   Tensor(BufferBase *buffer, DataType dtype)
       : dtype_(dtype),
         buffer_(buffer),
         is_buffer_owner_(false),
+        unused_(false),
         name_("") {}
 
   Tensor(const BufferSlice &buffer_slice, DataType dtype)
       : dtype_(dtype),
         buffer_slice_(buffer_slice),
         is_buffer_owner_(false),
+        unused_(false),
         name_("") {
     buffer_ = &buffer_slice_;
   }
@@ -133,6 +136,8 @@ class Tensor {
   inline void SetDtype(DataType dtype) { dtype_ = dtype; }
 
+  inline bool unused() const { return unused_; }
+
   inline const std::vector<index_t> &shape() const { return shape_; }
 
   inline index_t dim_size() const { return shape_.size(); }
@@ -195,6 +200,10 @@ class Tensor {
     return static_cast<T *>(buffer_->raw_mutable_data());
   }
 
+  inline void MarkUnused() {
+    unused_ = true;
+  }
+
   inline void Clear() {
     MACE_CHECK_NOTNULL(buffer_);
     buffer_->Clear(raw_size());
@@ -362,6 +371,7 @@ class Tensor {
   BufferBase *buffer_;
   BufferSlice buffer_slice_;
   bool is_buffer_owner_;
+  bool unused_;
   std::string name_;
 
   MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
......
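The new flag is a plain bool with no synchronization; in this commit it is only set and read during engine initialization. A minimal illustration of the intended lifecycle (the allocator and dtype are illustrative; the calls are the ones this commit adds):

Tensor weights(GetDeviceAllocator(DeviceType::GPU), DT_FLOAT);
// weights.unused() == false: tensors start in use.
// ... the weights are copied into an OpenCL image by BufferToImageFunctor ...
weights.MarkUnused();
// weights.unused() == true: Workspace::RemoveUnusedBuffer() will erase it.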
@@ -18,6 +18,9 @@
 #include <utility>
 
 #include "mace/core/arg_helper.h"
+#ifdef MACE_ENABLE_OPENCL
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#endif
 #include "mace/core/workspace.h"
 #include "mace/utils/timer.h"
@@ -85,41 +88,82 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
   VLOG(3) << "Model data size: " << model_data_size;
 
   if (model_data_size > 0) {
-    if (type == DeviceType::CPU) {
-      tensor_buffer_ = std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(type),
-                     const_cast<unsigned char*>(model_data),
-                     model_data_size));
-    } else {
-      tensor_buffer_ = std::unique_ptr<Buffer>(
-          new Buffer(GetDeviceAllocator(type)));
-      MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
-      tensor_buffer_->Map(nullptr);
-      tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
-                           0, model_data_size);
-      tensor_buffer_->UnMap();
-    }
-  }
-
-  for (auto &const_tensor : net_def.tensors()) {
-    MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
-    VLOG(3) << "Tensor name: " << const_tensor.name()
-            << ", data type: " << const_tensor.data_type() << ", shape: "
-            << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
-                                               const_tensor.dims().end()));
-    std::vector<index_t> dims;
-    for (const index_t d : const_tensor.dims()) {
-      dims.push_back(d);
-    }
-
-    std::unique_ptr<Tensor> tensor(
-        new Tensor(BufferSlice(tensor_buffer_.get(), const_tensor.offset(),
-                               const_tensor.data_size() *
-                                   GetEnumTypeSize(const_tensor.data_type())),
-                   const_tensor.data_type()));
-
-    tensor->Reshape(dims);
-    tensor_map_[const_tensor.name()] = std::move(tensor);
-  }
+#ifdef MACE_ENABLE_OPENCL
+    if (type == DeviceType::GPU &&
+        OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <=
+            static_cast<uint64_t>(model_data_size)) {
+      for (auto &const_tensor : net_def.tensors()) {
+        MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
+        VLOG(3) << "Tensor name: " << const_tensor.name()
+                << ", data type: " << const_tensor.data_type() << ", shape: "
+                << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
+                                                   const_tensor.dims().end()));
+        std::vector<index_t> dims;
+        for (const index_t d : const_tensor.dims()) {
+          dims.push_back(d);
+        }
+
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(GetDeviceAllocator(type),
+                       const_tensor.data_type()));
+        tensor->Resize(dims);
+
+        MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                   "Tensor's data_size not equal with the shape");
+        MACE_CHECK(const_tensor.offset() + tensor->raw_size() <=
+                       model_data_size,
+                   "buffer offset + length (",
+                   const_tensor.offset(),
+                   " + ",
+                   tensor->raw_size(),
+                   ") should <= ",
+                   model_data_size);
+        tensor->CopyBytes(model_data + const_tensor.offset(),
+                          const_tensor.data_size() *
+                              GetEnumTypeSize(const_tensor.data_type()));
+
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      }
+    } else {
+#else
+    {
+#endif
+      if (type == DeviceType::CPU) {
+        tensor_buffer_ = std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(type),
+                       const_cast<unsigned char*>(model_data),
+                       model_data_size));
+      } else {
+        tensor_buffer_ = std::unique_ptr<Buffer>(
+            new Buffer(GetDeviceAllocator(type)));
+        MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
+        tensor_buffer_->Map(nullptr);
+        tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
+                             0, model_data_size);
+        tensor_buffer_->UnMap();
+      }
+
+      for (auto &const_tensor : net_def.tensors()) {
+        MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
+        VLOG(3) << "Tensor name: " << const_tensor.name()
+                << ", data type: " << const_tensor.data_type() << ", shape: "
+                << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
+                                                   const_tensor.dims().end()));
+        std::vector<index_t> dims;
+        for (const index_t d : const_tensor.dims()) {
+          dims.push_back(d);
+        }
+
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(BufferSlice(
+                tensor_buffer_.get(), const_tensor.offset(),
+                const_tensor.data_size() *
+                    GetEnumTypeSize(const_tensor.data_type())),
+                const_tensor.data_type()));
+
+        tensor->Reshape(dims);
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      }
+    }
   }
 
   if (type == DeviceType::CPU || type == DeviceType::GPU) {
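Review note: the restructuring above is a capability check, not just a refactor. When the packed model data is at least as large as the device's CL_DEVICE_MAX_MEM_ALLOC_SIZE, the single shared tensor_buffer_ cannot be allocated on the GPU, so each const tensor now gets its own buffer and an explicit CopyBytes; otherwise the original zero-copy BufferSlice path is kept. Schematically (names from the hunk above):

// Fits in one allocation: tensors are views into one shared buffer (no copy).
//   Tensor t(BufferSlice(tensor_buffer_.get(), offset, nbytes), dtype);
// Too large for one allocation: each tensor owns and fills its own buffer.
//   Tensor t(GetDeviceAllocator(type), dtype);
//   t.Resize(dims);
//   t.CopyBytes(model_data + offset, nbytes);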
@@ -232,4 +276,17 @@ ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
   }
 }
 
+void Workspace::RemoveUnusedBuffer() {
+  auto iter = tensor_map_.begin();
+  auto end_iter = tensor_map_.end();
+  while (iter != end_iter) {
+    auto old_iter = iter++;
+    if (old_iter->second->unused()) {
+      tensor_map_.erase(old_iter);
+    }
+  }
+  tensor_buffer_.reset(nullptr);
+}
+
 }  // namespace mace
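A side note on the erase loop: copying the iterator before advancing (old_iter) is the pre-C++11 idiom for erasing from a std::map while iterating. Since C++11, erase returns the iterator past the removed element, so an equivalent form would be:

for (auto iter = tensor_map_.begin(); iter != tensor_map_.end();) {
  if (iter->second->unused()) {
    iter = tensor_map_.erase(iter);  // C++11: returns the next iterator
  } else {
    ++iter;
  }
}

Resetting tensor_buffer_ afterwards presumes every slice-backed tensor was marked unused and erased, which holds on the GPU path this commit targets.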
@@ -53,6 +53,8 @@ class Workspace {
   ScratchBuffer *GetScratchBuffer(DeviceType device_type);
 
+  void RemoveUnusedBuffer();
+
  private:
   MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
                                       DeviceType device_type);
......
@@ -180,6 +180,9 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
     };
   }
 
+  // Mark the buffer unused.
+  const_cast<Tensor *>(buffer)->MarkUnused();
+
   return MACE_SUCCESS;
 }
......
@@ -222,6 +222,9 @@ MaceStatus MaceEngine::Impl::Init(
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
+  if (device_type_ == DeviceType::GPU) {
+    ws_->RemoveUnusedBuffer();
+  }
   return MaceStatus::MACE_SUCCESS;
 }
@@ -240,7 +243,7 @@ MaceStatus MaceEngine::Impl::Init(
   }
   model_data_ = LoadModelData(model_data_file, model_data_size_);
 
-  Init(net_def, input_nodes, output_nodes, model_data_);
+  MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_));
 
   if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON) {
     UnloadModelData(model_data_, model_data_size_);
......
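The second hunk fixes a silently dropped status: the MaceStatus returned by the inner Init was previously discarded, so a failed GPU initialization looked successful to callers. MACE_RETURN_IF_ERROR presumably expands to an early return on failure, along these lines (an illustrative expansion only; the real macro is defined in MACE's headers and may differ):

// Illustrative only, not MACE's actual definition.
#define MACE_RETURN_IF_ERROR(stmt)            \
  {                                           \
    MaceStatus status = (stmt);               \
    if (status != MaceStatus::MACE_SUCCESS) { \
      return status;                          \
    }                                         \
  }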
@@ -66,6 +66,7 @@ class KVStorageFactory {
 class __attribute__((visibility("default"))) FileStorageFactory
     : public KVStorageFactory {
  public:
+  // You have to make sure your app has read and write permission for the path.
   explicit FileStorageFactory(const std::string &path);
 
   ~FileStorageFactory();
@@ -77,7 +78,10 @@ class __attribute__((visibility("default"))) FileStorageFactory
   std::unique_ptr<Impl> impl_;
 };
 
-// Set KV store factory used as OpenCL cache. (Call Once)
+// Set the Key-Value store factory. (Call once.)
+// KVStorage is currently used to persist compiled OpenCL binaries to file,
+// which can speed up GPU initialization and the first run.
+// If this API is not called, GPU initialization may be slow.
 __attribute__((visibility("default")))
 void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
......
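For reference, a typical call sequence looks like the sketch below, assuming the public header mace/public/mace_runtime.h; the cache path is illustrative and must be readable and writable by the app, per the comment above:

#include "mace/public/mace_runtime.h"

// Illustrative path; any directory the app can read and write works.
const std::string storage_path = "/path/to/storage";
std::shared_ptr<KVStorageFactory> storage_factory(
    new FileStorageFactory(storage_path));
SetKVStorageFactory(storage_factory);  // call once, before creating MaceEngine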