提交 fc1c855e 编写于 作者: 李寅

Merge branch 'remove-unused-gpu-buffer' into 'master'

Remove unused tensors on GPU.

See merge request !665
......@@ -675,6 +675,12 @@ uint64_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
return size;
}
// Returns the maximum size, in bytes, of a single memory object the OpenCL
// device can allocate (CL_DEVICE_MAX_MEM_ALLOC_SIZE). Returns 0 if the
// query does not populate the value.
uint64_t OpenCLRuntime::GetDeviceMaxMemAllocSize() {
  uint64_t max_alloc_bytes = 0;
  device_->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &max_alloc_bytes);
  return max_alloc_bytes;
}
uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
uint64_t size = 0;
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
......
......@@ -85,6 +85,7 @@ class OpenCLRuntime {
void GetCallStats(const cl::Event &event, CallStats *stats);
uint64_t GetDeviceMaxWorkGroupSize();
uint64_t GetDeviceMaxMemAllocSize();
uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
bool IsNonUniformWorkgroupsSupported() const;
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstdlib>
#include <random>
#include <string>
#include "mace/core/runtime_failure_mock.h"
......@@ -35,10 +35,12 @@ inline float GetRuntimeFailureRatioFromEnv() {
} // namespace
bool ShouldMockRuntimeFailure() {
static unsigned int seed = time(NULL);
static float mock_runtime_failure_ratio = GetRuntimeFailureRatioFromEnv();
if (mock_runtime_failure_ratio > 1e-6) {
float random_ratio = rand_r(&seed) / static_cast<float>(RAND_MAX);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(0.0, 1.0);
float random_ratio = dis(gen);
if (random_ratio < mock_runtime_failure_ratio) {
VLOG(0) << "Mock runtime failure.";
return true;
......
......@@ -105,18 +105,21 @@ class Tensor {
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true),
unused_(false),
name_("") {}
// Wraps an externally owned buffer. The tensor does not take ownership
// (is_buffer_owner_ is false), so `buffer` must outlive this tensor.
Tensor(BufferBase *buffer, DataType dtype)
    : dtype_(dtype),
      buffer_(buffer),
      is_buffer_owner_(false),
      unused_(false),
      name_("") {}
// Wraps a slice of a larger buffer. The slice is copied into the member
// buffer_slice_ and buffer_ points at that member, so this tensor never
// owns the underlying memory (is_buffer_owner_ is false); the memory the
// slice refers to must outlive this tensor.
Tensor(const BufferSlice &buffer_slice, DataType dtype)
    : dtype_(dtype),
      buffer_slice_(buffer_slice),
      is_buffer_owner_(false),
      unused_(false),
      name_("") {
  buffer_ = &buffer_slice_;
}
......@@ -133,6 +136,8 @@ class Tensor {
// Sets the element data type of this tensor.
inline void SetDtype(DataType dtype) { dtype_ = dtype; }
// Whether this tensor has been flagged as unused (see MarkUnused()).
inline bool unused() const { return unused_; }
// The tensor's dimensions.
inline const std::vector<index_t> &shape() const { return shape_; }
// Number of dimensions (rank) of the tensor.
inline index_t dim_size() const { return shape_.size(); }
......@@ -195,6 +200,10 @@ class Tensor {
return static_cast<T *>(buffer_->raw_mutable_data());
}
// Flags this tensor as unused so the workspace can later remove it and
// reclaim its backing storage (see Workspace::RemoveUnusedBuffer()).
inline void MarkUnused() { unused_ = true; }
inline void Clear() {
MACE_CHECK_NOTNULL(buffer_);
buffer_->Clear(raw_size());
......@@ -362,6 +371,7 @@ class Tensor {
BufferBase *buffer_;
BufferSlice buffer_slice_;
bool is_buffer_owner_;
bool unused_;
std::string name_;
MACE_DISABLE_COPY_AND_ASSIGN(Tensor);
......
......@@ -18,6 +18,9 @@
#include <utility>
#include "mace/core/arg_helper.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"
......@@ -85,41 +88,82 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
VLOG(3) << "Model data size: " << model_data_size;
if (model_data_size > 0) {
if (type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type),
const_cast<unsigned char*>(model_data),
model_data_size));
} else {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type)));
MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
0, model_data_size);
tensor_buffer_->UnMap();
}
}
#ifdef MACE_ENABLE_OPENCL
if (type == DeviceType::GPU &&
OpenCLRuntime::Global()->GetDeviceMaxMemAllocSize() <=
static_cast<uint64_t>(model_data_size)) {
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type() << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end()));
std::vector<index_t> dims;
for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type() << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end()));
std::vector<index_t> dims;
for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(type),
const_tensor.data_type()));
tensor->Resize(dims);
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(), const_tensor.offset(),
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
MACE_CHECK(tensor->size() == const_tensor.data_size(),
"Tensor's data_size not equal with the shape");
MACE_CHECK(const_tensor.offset() + tensor->raw_size() <=
model_data_size,
"buffer offset + length (",
const_tensor.offset(),
" + ",
tensor->raw_size(),
") should <= ",
model_data_size);
tensor->CopyBytes(model_data + const_tensor.offset(),
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
tensor_map_[const_tensor.name()] = std::move(tensor);
}
} else {
#else
{
#endif
if (type == DeviceType::CPU) {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type),
const_cast<unsigned char*>(model_data),
model_data_size));
} else {
tensor_buffer_ = std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type)));
MACE_RETURN_IF_ERROR(tensor_buffer_->Allocate(model_data_size));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(const_cast<unsigned char*>(model_data),
0, model_data_size);
tensor_buffer_->UnMap();
}
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
<< ", data type: " << const_tensor.data_type() << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end()));
std::vector<index_t> dims;
for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(
tensor_buffer_.get(), const_tensor.offset(),
const_tensor.data_size() *
GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
}
}
}
if (type == DeviceType::CPU || type == DeviceType::GPU) {
......@@ -232,4 +276,17 @@ ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
}
}
// Erases every tensor that was flagged via Tensor::MarkUnused() from the
// tensor map, then releases the shared model-data buffer.
// NOTE(review): any surviving tensor that still aliases tensor_buffer_
// through a BufferSlice would dangle after the reset — presumably this is
// only called on paths where no such tensors remain; confirm with callers.
void Workspace::RemoveUnusedBuffer() {
  for (auto iter = tensor_map_.begin(); iter != tensor_map_.end();) {
    if (iter->second->unused()) {
      iter = tensor_map_.erase(iter);  // erase returns the next valid iterator
    } else {
      ++iter;
    }
  }
  tensor_buffer_.reset(nullptr);
}
} // namespace mace
......@@ -53,6 +53,8 @@ class Workspace {
ScratchBuffer *GetScratchBuffer(DeviceType device_type);
void RemoveUnusedBuffer();
private:
MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
DeviceType device_type);
......
......@@ -180,6 +180,9 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
};
}
// Mark the buffer unused.
const_cast<Tensor *>(buffer)->MarkUnused();
return MACE_SUCCESS;
}
......
......@@ -222,6 +222,9 @@ MaceStatus MaceEngine::Impl::Init(
#ifdef MACE_ENABLE_HEXAGON
}
#endif
if (device_type_ == DeviceType::GPU) {
ws_->RemoveUnusedBuffer();
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -240,7 +243,7 @@ MaceStatus MaceEngine::Impl::Init(
}
model_data_ = LoadModelData(model_data_file, model_data_size_);
Init(net_def, input_nodes, output_nodes, model_data_);
MACE_RETURN_IF_ERROR(Init(net_def, input_nodes, output_nodes, model_data_));
if (device_type_ == DeviceType::GPU || device_type_ == DeviceType::HEXAGON) {
UnloadModelData(model_data_, model_data_size_);
......
......@@ -66,6 +66,7 @@ class KVStorageFactory {
class __attribute__((visibility("default"))) FileStorageFactory
: public KVStorageFactory {
public:
// You have to make sure your app has read and write permission for the path.
explicit FileStorageFactory(const std::string &path);
~FileStorageFactory();
......@@ -77,7 +78,10 @@ class __attribute__((visibility("default"))) FileStorageFactory
std::unique_ptr<Impl> impl_;
};
// Set KV store factory used as OpenCL cache. (Call Once)
// Set Key-Value store factory. (Call Once)
// Now KVStorage is used to store the compiled OpenCL binaries to a file,
// which can speed up GPU initialization and the first run.
// If this API is not called, GPU initialization may be slow.
__attribute__((visibility("default")))
void SetKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册