From ed70de8070073e8641b1846459e1b6ea9c78313a Mon Sep 17 00:00:00 2001 From: liyong Date: Tue, 28 Jul 2020 20:00:42 +0800 Subject: [PATCH] fix coredump when number of file list more than 1000. --- .../mindrecord/common/shard_pybind.cc | 1 + .../mindrecord/include/common/shard_utils.h | 3 +- .../mindrecord/include/shard_header.h | 2 +- .../minddata/mindrecord/io/shard_reader.cc | 4 +-- .../minddata/mindrecord/meta/shard_header.cc | 18 ++++++++--- mindspore/dataset/engine/datasets.py | 30 +++++++++---------- mindspore/dataset/engine/validators.py | 2 ++ 7 files changed, 37 insertions(+), 23 deletions(-) diff --git a/mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc b/mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc index d9e51efc4..f36182027 100644 --- a/mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc +++ b/mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc @@ -133,6 +133,7 @@ void BindGlobalParams(py::module *m) { (*m).attr("MAX_PAGE_SIZE") = kMaxPageSize; (*m).attr("MIN_SHARD_COUNT") = kMinShardCount; (*m).attr("MAX_SHARD_COUNT") = kMaxShardCount; + (*m).attr("MAX_FILE_COUNT") = kMaxFileCount; (*m).attr("MIN_CONSUMER_COUNT") = kMinConsumerCount; (void)(*m).def("get_max_thread_num", &GetMaxThreadNum); } diff --git a/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h b/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h index 198cd46cc..5ee6f70a8 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h @@ -104,7 +104,8 @@ const uint64_t kInt64Len = 8; const uint64_t kMinFileSize = kInt64Len; const int kMinShardCount = 1; -const int kMaxShardCount = 1000; +const int kMaxShardCount = 1000; // write +const int kMaxFileCount = 4096; // read const int kMinConsumerCount = 1; const int kMaxConsumerCount = 128; diff --git a/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h b/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h index 4aad0d117..51928d787 100644 --- a/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h +++ b/mindspore/ccsrc/minddata/mindrecord/include/shard_header.h @@ -152,7 +152,7 @@ class ShardHeader { MSRStatus CheckIndexField(const std::string &field, const json &schema); - void ParsePage(const json &page, int shard_index, bool load_dataset); + MSRStatus ParsePage(const json &page, int shard_index, bool load_dataset); MSRStatus ParseStatistics(const json &statistics); diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc index 84d7fddb6..37d7f19b6 100644 --- a/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc +++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc @@ -252,7 +252,7 @@ std::vector> ShardReader::ReadRowGroupSummar if (shard_count <= 0) { return row_group_summary; } - if (shard_count <= kMaxShardCount) { + if (shard_count <= kMaxFileCount) { for (int shard_id = 0; shard_id < shard_count; ++shard_id) { // return -1 when page's size equals to 0. auto last_page_id = shard_header_->GetLastPageId(shard_id); @@ -1054,7 +1054,7 @@ MSRStatus ShardReader::CreateTasksByRow(const std::vector(ret); auto local_columns = std::get<2>(ret); - if (shard_count_ <= kMaxShardCount) { + if (shard_count_ <= kMaxFileCount) { for (int shard_id = 0; shard_id < shard_count_; shard_id++) { for (uint32_t i = 0; i < offsets[shard_id].size(); i += 1) { tasks_.InsertTask(TaskType::kCommonTask, offsets[shard_id][i][0], offsets[shard_id][i][1], diff --git a/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc b/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc index 843b412a3..f94f92d93 100644 --- a/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc +++ b/mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc @@ -55,7 +55,9 @@ MSRStatus ShardHeader::InitializeHeader(const std::vector &headers, bool l header_size_ = header["header_size"].get(); page_size_ = header["page_size"].get(); } - ParsePage(header["page"], shard_index, load_dataset); + if (SUCCESS != ParsePage(header["page"], shard_index, load_dataset)) { + return FAILED; + } shard_index++; } return SUCCESS; @@ -248,11 +250,16 @@ MSRStatus ShardHeader::ParseIndexFields(const json &index_fields) { return SUCCESS; } -void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) { +MSRStatus ShardHeader::ParsePage(const json &pages, int shard_index, bool load_dataset) { // set shard_index when load_dataset is false - if (pages_.empty() && shard_count_ <= kMaxShardCount) { + if (shard_count_ > kMaxFileCount) { + MS_LOG(ERROR) << "The number of mindrecord files is greater than max value: " << kMaxFileCount; + return FAILED; + } + if (pages_.empty() && shard_count_ <= kMaxFileCount) { pages_.resize(shard_count_); } + for (auto &page : pages) { int page_id = page["page_id"]; int shard_id = page["shard_id"]; @@ -275,6 +282,7 @@ void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_datase pages_[shard_index].push_back(std::move(parsed_page)); } } + return SUCCESS; } MSRStatus ShardHeader::ParseStatistics(const json &statistics) { @@ -715,7 +723,9 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) { std::string line; while (std::getline(page_in_handle, line)) { - ParsePage(json::parse(line), -1, true); + if (SUCCESS != ParsePage(json::parse(line), -1, true)) { + return FAILED; + } } page_in_handle.close(); diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index caf3857e2..d537fc3fb 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -1054,45 +1054,45 @@ class Dataset: * - type in 'dataset' - type in 'mindrecord' - detail - * - DE_BOOL + * - bool - None - Not support - * - DE_INT8 + * - int8 - int32 - - * - DE_UINT8 + * - uint8 - bytes(1D uint8) - Drop dimension - * - DE_INT16 + * - int16 - int32 - - * - DE_UINT16 + * - uint16 - int32 - - * - DE_INT32 + * - int32 - int32 - - * - DE_UINT32 + * - uint32 - int64 - - * - DE_INT64 + * - int64 - int64 - - * - DE_UINT64 + * - uint64 - None - Not support - * - DE_FLOAT16 - - Not support + * - float16 + - float32 - - * - DE_FLOAT32 + * - float32 - float32 - - * - DE_FLOAT64 + * - float64 - float64 - - * - DE_STRING + * - string - string - - Not support multi-dimensional DE_STRING + - Not support multi-dimensional string Note: 1. To save the samples in order, should set dataset's shuffle false and num_files 1. diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py index a9a61c113..03f52380c 100644 --- a/mindspore/dataset/engine/validators.py +++ b/mindspore/dataset/engine/validators.py @@ -278,6 +278,8 @@ def check_minddataset(method): dataset_file = param_dict.get('dataset_file') if isinstance(dataset_file, list): + if len(dataset_file) > 4096: + raise ValueError("length of dataset_file should less than or equal to {}.".format(4096)) for f in dataset_file: check_file(f) else: -- GitLab