Commit 399200b3 authored by Megvii Engine Team

perf(serialization): optimize memory usage when loading new-format models

GitOrigin-RevId: 2b7313ebe39a7d4a44a8ab61fa0f3646fd7de566
Parent f31e52d5
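
For context, the new FLATBUFFERS_V2 path lets CPU tensors alias the model buffer instead of copying it when the input memory is shared and writable. Below is a minimal usage sketch mirroring the test_serializer_memshare test added in this diff; the helper name and header paths are assumptions, not part of the change itself.

#include <cstdint>
#include <memory>
#include <vector>

#include "megbrain/serialization/file.h"
#include "megbrain/serialization/serializer.h"

using namespace mgb;
using namespace mgb::serialization;

// Load a FLATBUFFERS_V2 model from a caller-owned, writable buffer. The aliasing
// shared_ptr carries no ownership, so model_buf must outlive the loaded graph.
GraphLoader::LoadResult load_from_shared_buffer(std::vector<uint8_t>& model_buf) {
    auto file = InputFile::make_mem_proxy(
            std::shared_ptr<void>{std::shared_ptr<void>{}, model_buf.data()},
            model_buf.size());
    auto loader = GraphLoader::make(std::move(file), GraphDumpFormat::FLATBUFFERS_V2);
    // Shared CPU tensors may now point directly into model_buf (after in-place
    // alignment) instead of each receiving a private copy.
    return loader->load();
}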
......@@ -25,7 +25,11 @@ class OprParamsLoadContext final : public serialization::OprLoadContextRawPOD {
std::shared_ptr<HostTensorND> load_tensor() override { mgb_assert(0); }
std::shared_ptr<DeviceTensorND> load_tensor_shared() override { mgb_assert(0); }
std::shared_ptr<DeviceTensorND> load_tensor_shared(
bool copy_immediatly = false) override {
(void)copy_immediatly;
mgb_assert(0);
}
const serialization::GraphLoadConfig& config() const override { mgb_assert(0); }
......
......@@ -245,9 +245,6 @@ int LITE_destroy_network(LiteNetwork network) {
auto& global_holder = get_gloabl_network_holder();
if (global_holder.find(network) != global_holder.end()) {
global_holder.erase(network);
} else {
        //! means the network has already been destroyed
return -1;
}
LITE_CAPI_END();
}
......
......@@ -75,9 +75,6 @@ int LITE_destroy_tensor(LiteTensor tensor) {
auto& global_holder = get_global_tensor_holder();
if (global_holder.find(tensor) != global_holder.end()) {
global_holder.erase(tensor);
} else {
        //! return -1: the tensor has already been destroyed
return -1;
}
LITE_CAPI_END();
}
......
......@@ -16,26 +16,8 @@ void ModelLite::create_network() {
}
void ModelLite::load_model() {
if (share_model_mem) {
        //! WARNING: it may not be right to share param memory for this
LITE_LOG("enable share model memory");
FILE* fin = fopen(model_path.c_str(), "rb");
LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno));
fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, free};
auto nr = fread(buf.get(), 1, size, fin);
LITE_ASSERT(nr == size, "read model file failed");
fclose(fin);
m_network->load_model(buf.get(), size);
} else {
m_network->load_model(model_path);
}
    //! lite shares the model memory by default
m_network->load_model(model_path);
}
void ModelLite::run_model() {
......
......@@ -128,7 +128,10 @@ std::shared_ptr<void> ModelParser::decrypt_memory(
const uint8_t* memory_ptr = data;
if (decryption_name == "NONE") {
result_length = length;
return std::shared_ptr<void>(const_cast<uint8_t*>(memory_ptr), [](void*) {});
std::shared_ptr<uint8_t> shptr{
new uint8_t[length], [](uint8_t* p) { delete[] p; }};
memcpy(shptr.get(), data, length);
return shptr;
}
LITE_LOCK_GUARD(decryption_static_data().map_mutex);
auto it = decryption_static_data().decryption_methods.find(decryption_name);
......
......@@ -1032,7 +1032,7 @@ TEST(TestCapiNetWork, GlobalHolder) {
LITE_make_network(&c_network, *default_config(), *default_network_io()));
//! make sure destroy_network is destroyed by LITE_make_network
LITE_destroy_network(destroy_network);
ASSERT_EQ(LITE_destroy_network(destroy_network), -1);
ASSERT_EQ(LITE_destroy_network(destroy_network), 0);
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}
......
......@@ -328,7 +328,7 @@ TEST(TestCapiTensor, GlobalHolder) {
LITE_make_tensor(description, &c_tensor0);
//! make sure destroy_tensor is destroyed by LITE_make_tensor
LITE_destroy_tensor(destroy_tensor);
ASSERT_EQ(LITE_destroy_tensor(destroy_tensor), -1);
ASSERT_EQ(LITE_destroy_tensor(destroy_tensor), 0);
LITE_destroy_tensor(c_tensor0);
}
......
......@@ -332,6 +332,7 @@ class TensorND {
public:
using ChainReturnType = TensorND<TensorStorage>;
using Storage = TensorStorage;
MGE_WIN_DECLSPEC_FUC TensorND();
......
......@@ -443,38 +443,41 @@ void run<shape_dep_const_shape>(CompNode cn) {
HostTensorGenerator<> gen;
auto host_x = gen({4, 5}, cn);
auto fname = output_file("test_comp_node_record_shape_dep_const_shape");
auto test = [&](serialization::GraphDumpFormat format) {
HostTensorND y_expect;
{
// dump graph
auto graph = ComputingGraph::make();
auto x = opr::Host2DeviceCopy::make(
*graph, host_x, OperatorNodeConfig{"x"}),
y = x.flatten() +
opr::reduce_sum(opr::GetVarShape::make(x), x.make_scalar(1));
HostTensorND y_expect;
{
// dump graph
auto graph = ComputingGraph::make();
auto x = opr::Host2DeviceCopy::make(*graph, host_x, OperatorNodeConfig{"x"}),
y = x.flatten() +
opr::reduce_sum(opr::GetVarShape::make(x), x.make_scalar(1));
graph->compile({make_callback_copy(y, y_expect)})->execute();
auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
dumper->dump({y});
}
graph->compile({make_callback_copy(y, y_expect)})->execute();
HostTensorND host_y;
{
GraphLoadConfig config;
config.const_var_shape = true;
auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
auto load_rst = loader->load(config);
load_rst.graph->options().comp_node_seq_record_level = 2;
load_rst.graph->options().var_sanity_check_first_run = false;
auto x_inp = load_rst.tensor_map.at("x");
auto y = load_rst.output_var_list.at(0);
auto func = load_rst.graph_compile({make_callback_copy(y, host_y)});
x_inp->copy_from(*host_x);
func->execute();
}
auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()), format);
dumper->dump({y});
}
MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
HostTensorND host_y;
{
GraphLoadConfig config;
config.const_var_shape = true;
auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()), format);
auto load_rst = loader->load(config);
load_rst.graph->options().comp_node_seq_record_level = 2;
load_rst.graph->options().var_sanity_check_first_run = false;
auto x_inp = load_rst.tensor_map.at("x");
auto y = load_rst.output_var_list.at(0);
auto func = load_rst.graph_compile({make_callback_copy(y, host_y)});
x_inp->copy_from(*host_x);
func->execute();
}
MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
};
test({});
test(serialization::GraphDumpFormat::FLATBUFFERS_V2);
}
//! single thread multi recorder run interleave
......
......@@ -367,16 +367,19 @@ MGB_DYN_TYPE_OBJ_FINAL_IMPL(ImmutableTensor);
class ImmutableTensor::Value {
MGB_MUTEX m_mtx;
DeviceTensorND m_dev, m_static_infer;
std::shared_ptr<DeviceTensorND> m_dev = std::make_shared<DeviceTensorND>();
DeviceTensorND m_static_infer;
std::string m_summary;
public:
void setup(CompNode cn, const HostTensorND& val);
bool initialized() const { return m_dev.shape_valid(); }
void setup(std::shared_ptr<DeviceTensorND> val);
bool initialized() const { return m_dev->shape_valid(); }
//! value on comp node
const DeviceTensorND& dev() const { return m_dev; }
const DeviceTensorND& dev() const { return *m_dev; }
//! get value on static infer CPU node
DeviceTensorND& static_infer();
......@@ -385,10 +388,17 @@ public:
const std::string& summary() const { return m_summary; }
};
void ImmutableTensor::Value::setup(std::shared_ptr<DeviceTensorND> val) {
mgb_assert(val);
m_dev = val;
m_summary = ssprintf("const%s", val->shape().to_string().c_str());
}
void ImmutableTensor::Value::setup(CompNode cn, const HostTensorND& val) {
mgb_assert(m_dev.empty() && !m_dev.shape_valid());
m_dev.comp_node(cn).copy_from(val).sync();
mgb_assert(val.empty() == m_dev.empty());
mgb_assert(m_dev->empty() && !m_dev->shape_valid());
m_dev->comp_node(cn).copy_from(val).sync();
mgb_assert(val.empty() == m_dev->empty());
auto one_elem = [](const TensorShape& shape) {
for (size_t i = 0; i < shape.ndim; ++i) {
......@@ -413,8 +423,8 @@ void ImmutableTensor::Value::setup(CompNode cn, const HostTensorND& val) {
DeviceTensorND& ImmutableTensor::Value::static_infer() {
MGB_LOCK_GUARD(m_mtx);
if (!m_static_infer.shape_valid()) {
mgb_assert(m_dev.shape_valid());
m_static_infer.comp_node(CompNode::default_cpu()).copy_from(m_dev);
mgb_assert(m_dev->shape_valid());
m_static_infer.comp_node(CompNode::default_cpu()).copy_from(*m_dev);
}
return m_static_infer;
}
......@@ -588,6 +598,19 @@ SymbolVar ImmutableTensor::make(
return make_from_value(graph, cache.get(val), {}, config);
}
SymbolVar ImmutableTensor::make(
ComputingGraph& graph, std::shared_ptr<DeviceTensorND> val,
const OperatorNodeConfig& config) {
auto cn = val->comp_node();
if (config.has_comp_node_set())
cn = config.get_single_comp_node();
auto value = std::make_shared<Value>();
value->setup(val);
return make_from_value(graph, *value, value, config);
}
SymbolVar ImmutableTensor::make(
ComputingGraph& graph, const DTypeScalar& val,
const OperatorNodeConfig& config) {
......
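
A brief usage sketch for the new ImmutableTensor::make overload added above (the helper name is invented for illustration): the DeviceTensorND is shared into the operator rather than copied, so the caller should not mutate it afterwards, and the comp node comes from the tensor unless the config overrides it.

#include <memory>

#include "megbrain/opr/io.h"

using namespace mgb;

// Build a constant node that shares an existing device tensor with the graph.
SymbolVar make_shared_const(
        ComputingGraph& graph, std::shared_ptr<DeviceTensorND> value) {
    return opr::ImmutableTensor::make(graph, value, {});
}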
......@@ -132,8 +132,10 @@ struct OprLoadDumpImpl<opr::ImmutableTensor, 0> {
OprLoadContext& ctx, const cg::VarNodeArray& inputs,
const OperatorNodeConfig& config) {
mgb_assert(inputs.empty());
auto val = ctx.load_tensor();
return Opr::make(ctx.graph(), *val, config).node()->owner_opr();
        //! because ImmutableTensor is used in shape or value inference,
        //! its value must be copied immediately
auto val = ctx.load_tensor_shared(true);
return Opr::make(ctx.graph(), val, config).node()->owner_opr();
}
};
......
......@@ -32,8 +32,10 @@ struct OprLoadDumpImplV2<opr::ImmutableTensor, 0> {
auto fopr = reinterpret_cast<const fbs::v2::Operator*>(
fbs_ctx.get_current_opr_data());
if (fopr->tensors() && fopr->tensors()->size() > 0) {
auto val = fbs_ctx.load_tensor();
return Opr::make(fbs_ctx.graph(), *val, config).node()->owner_opr();
            //! because ImmutableTensor is used in shape or value inference,
            //! its value must be copied immediately
auto val = fbs_ctx.load_tensor_shared(true);
return Opr::make(fbs_ctx.graph(), val, config).node()->owner_opr();
} else {
mgb_throw(SerializationError, "ImmutableTensor load with no tensor data.");
}
......
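
Both loader specializations above request copy_immediatly=true. As a hedged illustration of what the flag means on the loader side (paraphrasing the branch added in serializer_oss.cpp below; the helper name is invented), the value is copied to the target comp node with a blocking copy at load time, so static shape/value inference can read it right away instead of waiting for the lazy device-value upload:

#include <memory>

#include "megbrain/tensor.h"

using namespace mgb;

// Materialize a loaded host value on `cn` immediately (no lazy upload).
std::shared_ptr<DeviceTensorND> copy_immediately(const HostTensorND& hv, CompNode cn) {
    auto dv = std::make_shared<DeviceTensorND>();
    dv->comp_node(cn).copy_from(hv).sync();  // blocking copy, ready for static inference
    return dv;
}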
......@@ -360,6 +360,10 @@ public:
ComputingGraph& graph, const HostTensorND& val,
const OperatorNodeConfig& config = {});
MGE_WIN_DECLSPEC_FUC static SymbolVar make(
ComputingGraph& graph, std::shared_ptr<DeviceTensorND> val,
const OperatorNodeConfig& config = {});
//! make from DTypeScalar; comp node must be provided in config
MGE_WIN_DECLSPEC_FUC static SymbolVar make(
ComputingGraph& graph, const DTypeScalar& val,
......
......@@ -138,6 +138,10 @@ public:
mgb_assert(m_refhold && size);
}
bool is_shared_memory() override { return true; }
bool writable() override { return m_writable; }
void have_modified() override { m_modified = true; }
void rewind() override {
if (m_modified) {
            // data has been modified; cannot read it again
......
......@@ -63,7 +63,11 @@ class OprLoadContextMemory final : public OprLoadContextRawPOD {
std::shared_ptr<HostTensorND> load_tensor() override { mgb_assert(0); }
std::shared_ptr<DeviceTensorND> load_tensor_shared() override { mgb_assert(0); }
std::shared_ptr<DeviceTensorND> load_tensor_shared(
bool copy_immediatly = false) override {
(void)copy_immediatly;
mgb_assert(0);
}
const GraphLoadConfig& config() const override {
mgb_throw(GraphError, "OprLoadContextMemory has no associated config");
......
......@@ -483,7 +483,8 @@ class GraphLoaderOSS::OprLoadContextImpl final : public OprLoadContextFlatBuffer
std::shared_ptr<HostTensorND> load_tensor() override;
std::shared_ptr<DeviceTensorND> load_tensor_shared() override;
std::shared_ptr<DeviceTensorND> load_tensor_shared(
bool copy_immediatly = false) override;
void load_single_opr(const fbs::Operator* opr);
......@@ -641,8 +642,8 @@ std::shared_ptr<HostTensorND> GraphLoaderOSS::OprLoadContextImpl::load_tensor()
return ret;
}
std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::
load_tensor_shared() {
std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::load_tensor_shared(
bool copy_immediatly) {
mgb_assert(
m_current_opr->tensors() &&
m_cur_opr_tensor_cnt < m_current_opr->tensors()->size());
......@@ -650,6 +651,9 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::
auto comp_node = load_comp_node(tensor->comp_node());
auto layout = load_tensor_layout(tensor);
mgb_assert(tensor->data_size());
if (m_loader->m_shared_tensor_map.size() <= m_cur_shared_tensor_idx) {
m_loader->m_shared_tensor_map.resize(m_cur_shared_tensor_idx + 5);
}
auto&& sh_reg = m_loader->m_shared_tensor_map.at(m_cur_shared_tensor_idx++);
auto&& sh_ptr_ref = sh_reg.second[comp_node.mem_node()];
if (sh_ptr_ref) {
......@@ -673,6 +677,11 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::
load_tensor_value(&hv, layout, tensor);
sh_ptr_ref = std::make_shared<DeviceTensorND>();
*sh_ptr_ref = DeviceTensorND::make_proxy(hv);
} else if (copy_immediatly) {
HostTensorND hv{CompNode::default_cpu()};
load_tensor_value(&hv, layout, tensor);
sh_ptr_ref = std::make_shared<DeviceTensorND>();
sh_ptr_ref->comp_node(comp_node).copy_from(hv).sync();
} else {
// use lazy load for non-CPU devices
HostTensorND hv{CompNode::default_cpu()};
......@@ -803,7 +812,7 @@ GraphLoader::LoadResult GraphLoaderOSS::OprLoadContextImpl::load_oprs() {
ret.output_var_map_id[out->original_id()] = var;
ret.output_var_list[i] = var;
}
mgb_assert(m_cur_shared_tensor_idx == m_loader->m_shared_tensor_map.size());
mgb_assert(m_cur_shared_tensor_idx <= m_loader->m_shared_tensor_map.size());
return ret;
}
......@@ -880,7 +889,7 @@ GraphLoader::LoadResult GraphLoaderOSS::load(const LoadConfig& config, bool rewi
if (m_shared_tensor_map.empty()) {
m_shared_tensor_map.resize(m_graph->nr_shared_tensor());
} else {
mgb_assert(m_shared_tensor_map.size() == m_graph->nr_shared_tensor());
mgb_assert(m_shared_tensor_map.size() >= m_graph->nr_shared_tensor());
}
OprLoadContextImpl ctx{this, m_graph->mgb_version()};
......
#if MGB_ENABLE_FBS_SERIALIZATION
#include <map>
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/io.h"
#include "megbrain/serialization/helper.h"
......@@ -523,6 +524,77 @@ void GraphDumperOSSV2::dump_buf_with_len(const void* data, uint32_t size) {
}
// ----------------------------- Loader --------------------------------------
/**
 * SharedTensorAlignMent records all shared device tensors. When first loaded the
 * tensors are not aligned; after all shared device tensors have been loaded, the
 * user-provided memory is rewritten in place and each tensor is moved to an
 * aligned address.
 */
class GraphLoaderOSSV2::SharedTensorAlignMent {
public:
SharedTensorAlignMent(SharedBuffer buffer, InputFile* file, bool is_enabled)
: m_enabled(is_enabled), m_file(file), m_model_buffer(buffer){};
bool add_device_tensor(std::shared_ptr<DeviceTensorND> tensor) {
if (!m_enabled)
return false;
if (tensor) {
m_device_tensors[reinterpret_cast<intptr_t>(tensor->raw_ptr())] = tensor;
return true;
}
return false;
}
    /**
     * Reorder the tensors shared from m_model_buffer by copying each one to an
     * aligned address. The model buffer is modified in place, so the file cannot
     * be read again afterwards.
     */
bool reorder_and_align_tensor() {
if (!m_enabled)
return false;
        bool modified = false;
intptr_t buffer_start = reinterpret_cast<intptr_t>(m_model_buffer.data());
intptr_t write_end = buffer_start;
for (auto& iter : m_device_tensors) {
auto& tensor = iter.second;
size_t tensor_size = tensor->layout().span().dist_byte();
size_t alignment = tensor->comp_node().get_mem_addr_alignment();
intptr_t tensor_start = reinterpret_cast<intptr_t>(tensor->raw_ptr());
intptr_t align_start = static_cast<intptr_t>(
reinterpret_cast<uintptr_t>(tensor->raw_ptr()) & ~(alignment - 1));
if (align_start > write_end) {
if (tensor_start != align_start) {
memmove(reinterpret_cast<void*>(align_start),
reinterpret_cast<void*>(tensor_start), tensor_size);
                    modified = true;
}
write_end = align_start + tensor_size;
DeviceTensorStorage storage;
auto raw_storage = std::shared_ptr<mgb::dt_byte>(
reinterpret_cast<mgb::dt_byte*>(align_start), [](void*) {});
storage.reset(tensor->comp_node(), tensor_size, raw_storage);
tensor->reset(storage, tensor->layout());
} else {
DeviceTensorND new_tensor(tensor->comp_node());
new_tensor.copy_from(*tensor).sync();
*tensor = new_tensor;
}
            if (modified) {
m_file->have_modified();
}
}
return true;
}
private:
bool m_enabled = false;
InputFile* m_file;
SharedBuffer m_model_buffer;
std::map<intptr_t, std::shared_ptr<DeviceTensorND>> m_device_tensors;
};
CompNode GraphLoaderOSSV2::OprLoadContextImpl::load_comp_node(
const fbs::v2::CompNode* comp_node) {
mgb_assert(comp_node);
......@@ -596,7 +668,9 @@ std::shared_ptr<HostTensorND> GraphLoaderOSSV2::OprLoadContextImpl::load_tensor(
"serialization v2 format is pure flatbuffer format, not support "
"user tensor value loader callback.");
}
memcpy(ret->raw_ptr(), tensor->data()->data(), tensor->data()->size());
fill_tensor_memory(
*ret, tensor->data()->data(), tensor->data()->size(),
m_loader->m_file->is_shared_memory());
}
if (tensor->name()) {
m_tensor_map[tensor->name()->str()] = ret;
......@@ -612,7 +686,7 @@ std::shared_ptr<HostTensorND> GraphLoaderOSSV2::OprLoadContextImpl::load_tensor(
}
std::shared_ptr<DeviceTensorND> GraphLoaderOSSV2::OprLoadContextImpl::
load_tensor_shared() {
load_tensor_shared(bool copy_immediatly) {
mgb_assert(
m_current_opr->tensors() &&
m_cur_opr_tensor_cnt < m_current_opr->tensors()->size());
......@@ -620,6 +694,9 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSSV2::OprLoadContextImpl::
auto comp_node = load_comp_node(tensor->comp_node());
auto layout = load_tensor_layout(tensor, comp_node);
mgb_assert(tensor->data());
if (m_loader->m_shared_tensor_map.size() <= m_cur_shared_tensor_idx) {
m_loader->m_shared_tensor_map.resize(m_cur_shared_tensor_idx + 5);
}
auto&& shared_pair = m_loader->m_shared_tensor_map.at(m_cur_shared_tensor_idx++);
auto&& shared_tensor_ref = shared_pair.second[comp_node.mem_node()];
if (shared_tensor_ref) {
......@@ -637,19 +714,34 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSSV2::OprLoadContextImpl::
if (comp_node.mem_node() == CompNode::default_cpu().mem_node()) {
// directly forward CPU memory
shared_tensor_ref = std::make_shared<DeviceTensorND>();
HostTensorND hv{comp_node};
if (tensor->data() && tensor->data()->size() > 0) {
hv.dtype(layout.dtype).resize(layout);
memcpy(hv.raw_ptr(), tensor->data()->data(), tensor->data()->size());
fill_tensor_memory(
hv, tensor->data()->data(), tensor->data()->size(),
m_loader->m_file->is_shared_memory());
}
shared_tensor_ref = std::make_shared<DeviceTensorND>();
*shared_tensor_ref = DeviceTensorND::make_proxy(hv);
m_tensor_alignment->add_device_tensor(shared_tensor_ref);
} else if (copy_immediatly) {
HostTensorND hv{CompNode::default_cpu()};
shared_tensor_ref = std::make_shared<DeviceTensorND>();
if (tensor->data() && tensor->data()->size() > 0) {
hv.dtype(layout.dtype).resize(layout);
fill_tensor_memory(
hv, tensor->data()->data(), tensor->data()->size(),
m_loader->m_file->is_shared_memory());
}
shared_tensor_ref->comp_node(comp_node).copy_from(hv).sync();
} else {
// use lazy load for non-CPU devices
HostTensorND hv{CompNode::default_cpu()};
if (tensor->data() && tensor->data()->size() > 0) {
hv.dtype(layout.dtype).resize(layout);
memcpy(hv.raw_ptr(), tensor->data()->data(), tensor->data()->size());
fill_tensor_memory(
hv, tensor->data()->data(), tensor->data()->size(),
m_loader->m_file->is_shared_memory());
}
shared_tensor_ref = m_device_value_loader.make(comp_node, std::move(hv));
}
......@@ -784,7 +876,7 @@ GraphLoader::LoadResult GraphLoaderOSSV2::OprLoadContextImpl::load_oprs() {
ret.output_var_map_id[out->original_id()] = var;
ret.output_var_list[i] = var;
}
mgb_assert(m_cur_shared_tensor_idx == m_loader->m_shared_tensor_map.size());
mgb_assert(m_cur_shared_tensor_idx <= m_loader->m_shared_tensor_map.size());
return ret;
}
......@@ -808,7 +900,6 @@ GraphLoader::LoadResult GraphLoaderOSSV2::load(const LoadConfig& config, bool re
m_file->read(&size, sizeof(size));
m_file->skip(-sizeof(size));
m_model_buf = m_file->read_shared(size + sizeof(size));
{
flatbuffers::Verifier verifier(
static_cast<const uint8_t*>(m_model_buf.data()), m_model_buf.size());
......@@ -838,8 +929,10 @@ GraphLoader::LoadResult GraphLoaderOSSV2::load(const LoadConfig& config, bool re
} else {
mgb_assert(m_shared_tensor_map.size() == m_model->nr_shared_tensor());
}
OprLoadContextImpl ctx{this, m_model->mge_version()};
SharedTensorAlignMent tensor_alignment(
m_model_buf, m_file.get(),
m_file->writable() && m_file->is_shared_memory());
OprLoadContextImpl ctx{this, &tensor_alignment, m_model->mge_version()};
ctx.load_middle_tensor();
auto metadata = ctx.load_metadata();
auto result = ctx.load_oprs();
......@@ -856,6 +949,7 @@ GraphLoader::LoadResult GraphLoaderOSSV2::load(const LoadConfig& config, bool re
}
}
m_model_loaded = true;
tensor_alignment.reorder_and_align_tensor();
result.graph_compile_ahead();
return result;
}
......
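
As a standalone sketch of the address arithmetic used by SharedTensorAlignMent::reorder_and_align_tensor above (the function name below is invented; alignment is assumed to be a power of two):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Round a tensor's start address down to `alignment` and move the payload there.
// memmove is used because the source and destination regions may overlap.
void* align_down_and_move(void* tensor_start, size_t tensor_size, size_t alignment) {
    auto start = reinterpret_cast<uintptr_t>(tensor_start);
    auto aligned = start & ~(static_cast<uintptr_t>(alignment) - 1);
    if (aligned != start)
        std::memmove(reinterpret_cast<void*>(aligned), tensor_start, tensor_size);
    return reinterpret_cast<void*>(aligned);
}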
......@@ -41,6 +41,15 @@ public:
//! return current read offset
virtual size_t tell() = 0;
    //! whether this file supports sharing its memory when loading a model
    virtual bool is_shared_memory() { return false; }
    //! whether this file can be written to
    virtual bool writable() { return false; }
    //! notify that the file content has been modified
    virtual void have_modified() {}
/*!
* \brief read into a host tensor
*
......
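
The V2 loader consumes these new hooks as follows; a hedged sketch mirroring GraphLoaderOSSV2::load and reorder_and_align_tensor in this diff (the helper names and header path are assumptions):

#include "megbrain/serialization/file.h"

using namespace mgb::serialization;

// In-place tensor alignment rewrites the mapped model buffer, so it is enabled
// only when the file memory is both shared and writable.
bool can_align_in_place(InputFile& file) {
    return file.writable() && file.is_shared_memory();
}

// Once the buffer has been rearranged, the file is marked dirty so that a later
// rewind() refuses to re-read the modified data.
void mark_buffer_modified(InputFile& file) {
    file.have_modified();
}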
......@@ -208,7 +208,8 @@ public:
*
* It must be dumped with TensorWriteMethod::VALUE_SHARED
*/
virtual std::shared_ptr<DeviceTensorND> load_tensor_shared() = 0;
virtual std::shared_ptr<DeviceTensorND> load_tensor_shared(
bool copy_immediatly = false) = 0;
//! get associated global configuration
virtual const GraphLoadConfig& config() const = 0;
......
......@@ -104,6 +104,7 @@ class GraphLoaderOSSV2 final : public GraphLoader {
public:
class OprLoadContextImpl;
class SharedTensorAlignMent;
friend class OprLoadContextImpl;
GraphLoaderOSSV2(std::unique_ptr<InputFile> input_file)
......@@ -136,22 +137,51 @@ class GraphLoaderOSSV2::OprLoadContextImpl final : public OprLoadContextFlatBuff
size_t m_cur_opr_tensor_cnt;
size_t m_cur_opr_blob_cnt;
size_t m_cur_opr_param_cnt;
SharedTensorAlignMent* m_tensor_alignment;
public:
friend class SharedTensorAlignMent;
ComputingGraph& graph() override { return *m_graph; }
const GraphLoadConfig& config() const override {
return *m_loader->m_cur_load_config;
}
    //! Share or copy the loaded flatbuffer memory into the CPU tensor. Sharing can
    //! reduce the memory used when loading a model, but the caller must take
    //! memory alignment into account.
void fill_tensor_memory(
HostTensorND& tensor, const uint8_t* data, size_t size, bool shared) {
auto tensor_size = tensor.layout().span().high_byte;
mgb_assert(
size == tensor_size,
"the size is not match when shared the flatbuffer memory\n");
auto ptr = reinterpret_cast<void*>(const_cast<uint8_t*>(data));
if (shared) {
HostTensorStorage storage;
auto raw_storage = std::shared_ptr<mgb::dt_byte>(
static_cast<mgb::dt_byte*>(ptr), [](void*) {});
storage.reset(tensor.comp_node(), size, raw_storage);
tensor.reset(storage, tensor.layout());
} else {
memcpy(tensor.raw_ptr(), data, size);
}
}
std::shared_ptr<HostTensorND> load_tensor() override;
std::shared_ptr<DeviceTensorND> load_tensor_shared() override;
std::shared_ptr<DeviceTensorND> load_tensor_shared(
bool copy_immediatly = false) override;
void load_single_opr(const fbs::v2::Operator* opr);
OprLoadContextImpl(GraphLoaderOSSV2* loader, uint32_t version)
: OprLoadContextFlatBuffers(version), m_loader{loader} {
OprLoadContextImpl(
GraphLoaderOSSV2* loader, SharedTensorAlignMent* tensor_alignment,
uint32_t version)
: OprLoadContextFlatBuffers(version),
m_loader{loader},
m_tensor_alignment(tensor_alignment) {
m_graph = loader->m_cur_load_config->comp_graph;
if (!m_graph) {
m_graph = ComputingGraph::make();
......
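
A self-contained sketch of the zero-copy branch of fill_tensor_memory above (the helper name is invented): the HostTensorND is re-pointed at externally owned bytes through a non-owning storage, so the external buffer must outlive the tensor and satisfy the required alignment.

#include <cstddef>
#include <memory>

#include "megbrain/tensor.h"

using namespace mgb;

// Alias `tensor` onto `size` bytes at `data` without copying; the no-op deleter
// means the bytes are not owned by the tensor.
void alias_host_tensor(HostTensorND& tensor, void* data, size_t size) {
    HostTensorStorage storage;
    auto raw = std::shared_ptr<dt_byte>(static_cast<dt_byte*>(data), [](void*) {});
    storage.reset(tensor.comp_node(), size, raw);
    tensor.reset(storage, tensor.layout());
}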
......@@ -315,7 +315,7 @@ void test_serializer_custom_loader(GraphDumpFormat format) {
load();
load();
ASSERT_EQ(2u, saved_val.size());
ASSERT_EQ(1, load_nr_null_ptr); // immutable tensor is not shared
ASSERT_EQ(2, load_nr_null_ptr); // immutable tensor is also shared
ASSERT_EQ(4, load_nr_call);
}
......@@ -482,10 +482,10 @@ void test_serializer_multiple_param(GraphDumpFormat format) {
ASSERT_THROW(loader->shared_tensor_id_map(), MegBrainError);
loader->load();
auto&& got = loader->shared_tensor_id_map();
ASSERT_EQ(values.size(), got.size());
ASSERT_EQ(2 * values.size(), got.size());
for (size_t i = 0; i < values.size(); ++i) {
ASSERT_EQ(1u, got[i].second.size());
auto &&vi = *values[i], &&gi = *got[i].second.begin()->second;
auto &&vi = *values[i], &&gi = *got[2 * i].second.begin()->second;
ASSERT_EQ(vi.shape(), gi.shape());
ASSERT_EQ(vi.comp_node(), gi.comp_node());
ASSERT_EQ(vi.dtype(), gi.dtype());
......@@ -565,7 +565,7 @@ void test_serializer_const_var_shape(GraphDumpFormat format) {
}
};
run_and_check(config);
ASSERT_EQ(2, nr_tensor);
        ASSERT_EQ(1, nr_tensor); // immutable tensors are now shared tensors
ASSERT_EQ(1, nr_mod);
}
}
......@@ -823,6 +823,77 @@ void test_serializer_log_exp(GraphDumpFormat format) {
load();
}
void test_serializer_memshare(GraphDumpFormat format) {
std::vector<uint8_t> buf;
HostTensorGenerator<> gen;
constexpr size_t SIZE = 127;
auto xval = gen({SIZE}, "cpu0"), bval = gen({1}, "cpu0");
auto dump = [&]() {
auto graph = ComputingGraph::make();
auto x0 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x0");
auto x1 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x1");
auto x2 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x2");
auto x3 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x3");
auto i4 = opr::ImmutableTensor::make(*graph, *xval).rename("i4");
auto i5 = opr::ImmutableTensor::make(*graph, *xval).rename("i5");
auto b = opr::SharedDeviceTensor::make(*graph, *bval).rename("b");
auto dumper = GraphDumper::make(OutputFile::make_vector_proxy(&buf), format);
dumper->dump({((x0 + x1) + b) + (x2 + x3) + i4 + i5, x0, i4});
};
HostTensorND expected;
expected.copy_from(*xval);
for (size_t i = 0; i < SIZE; ++i) {
auto&& v = expected.ptr<float>()[i];
v = v * 6 + bval->ptr<float>()[0];
}
std::vector<uint8_t> buf_al;
auto load = [&](bool share) {
std::unique_ptr<InputFile> fin;
if (share) {
buf_al.resize(buf.size());
memcpy(buf_al.data(), buf.data(), buf.size());
fin = InputFile::make_mem_proxy(
std::shared_ptr<void>{std::shared_ptr<void>{}, buf_al.data()},
buf.size());
} else {
fin = InputFile::make_mem_proxy(buf.data(), buf.size());
}
auto loader = GraphLoader::make(std::move(fin), format);
auto rst = loader->load();
auto x = rst.output_var_map.at("x0");
auto i4 = rst.output_var_map.at("i4");
auto&& opr = x.node()->owner_opr()->cast_final_safe<opr::SharedDeviceTensor>();
auto&& opr_imm =
i4.node()->owner_opr()->cast_final_safe<opr::ImmutableTensor>();
HostTensorND val;
auto func =
rst.graph_compile({make_callback_copy(rst.output_var_list[0], val)});
func->execute();
return std::make_pair(
val, std::vector<DeviceTensorND>{*opr.dev_data(), opr_imm.value()});
};
auto in_range = [](const std::vector<uint8_t>& buf, DeviceTensorND& dv) {
auto p0 = reinterpret_cast<uint8_t*>(dv.raw_ptr()),
p1 = reinterpret_cast<uint8_t*>(p0 + dv.layout().span().high_byte);
return buf.data() <= p0 && p1 <= buf.data() + buf.size();
};
for (bool share : {false, true}) {
buf.clear();
dump();
auto get = load(share);
MGB_ASSERT_TENSOR_EQ(*xval, HostTensorND{}.copy_from(get.second[0]).sync());
MGB_ASSERT_TENSOR_EQ(expected, get.first);
ASSERT_EQ(share, in_range(buf_al, get.second[0]));
ASSERT_EQ(share, in_range(buf_al, get.second[1]));
}
}
} // namespace
TEST(TestSerializer2, GraphDumpLoad) {
......@@ -967,6 +1038,10 @@ TEST(TestSerializer2, LOGEXPV2) {
test_serializer_log_exp(GraphDumpFormat::FLATBUFFERS_V2);
}
TEST(TestSerializer2, ShareMemv2) {
test_serializer_memshare(GraphDumpFormat::FLATBUFFERS_V2);
}
TEST(TestSerializer2, TestSoftMaxLoadDump) {
auto fname = GET_OUTPUT_FILE(GraphDumpFormat::FLATBUFFERS_V2);
TensorShape shape{2, 3};
......