diff --git a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc index c44e956b3ee603d0b3c5011087787a5e4aceca35..b378d1ee3b20df8a853069986f50c5cb8fdb3918 100644 --- a/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc +++ b/mindspore/ccsrc/minddata/dataset/api/de_pipeline.cc @@ -410,6 +410,7 @@ Status DEPipeline::SaveDataset(const std::vector &file_names, const std::vector index_fields; s = FetchMetaFromTensorRow(column_name_id_map, row, &mr_json, &index_fields); RETURN_IF_NOT_OK(s); + MS_LOG(DEBUG) << "Schema of saved mindrecord: " << mr_json.dump(); if (mindrecord::SUCCESS != mindrecord::ShardHeader::initialize(&mr_header, mr_json, index_fields, blob_fields, mr_schema_id)) { RETURN_STATUS_UNEXPECTED("Error: failed to initialize ShardHeader."); @@ -569,6 +570,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_map mr_shape(shapes.begin(), shapes.end()); std::string el = column_type.ToString(); + dataset_schema[column_name] = el; if (mindrecord::kTypesMap.find(el) == mindrecord::kTypesMap.end()) { std::string err_msg("Error: can not support data type: " + el); RETURN_STATUS_UNEXPECTED(err_msg); @@ -605,6 +608,7 @@ Status DEPipeline::FetchMetaFromTensorRow(const std::unordered_mapemplace_back(column_name); // candidate of index fields } + MS_LOG(DEBUG) << "Schema of dataset: " << dataset_schema.dump(); return Status::OK(); } Status DEPipeline::BuildMindrecordSamplerChain(const py::handle &handle, diff --git a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc index 2f2aebf7f012f2a4df8b53f9b927344b2b1c3d60..84b0c5b37a174470b9bee42ed93ef4be22a2c978 100644 --- a/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc +++ b/mindspore/ccsrc/minddata/mindrecord/io/shard_writer.cc @@ -83,7 +83,7 @@ MSRStatus ShardWriter::OpenDataFiles(bool append) { // if not append and mindrecord file exist, return FAILED fs->open(common::SafeCStr(file), 
std::ios::in | std::ios::binary); if (fs->good()) { - MS_LOG(ERROR) << "MindRecord file already existed."; + MS_LOG(ERROR) << "MindRecord file already existed, please delete file: " << common::SafeCStr(file); fs->close(); return FAILED; } diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py index 10bc3b96a6ca5ab515149be9c2bcc502d172573e..bd0b66ab6c9f1895c9288389145585bdbdf77910 100644 --- a/mindspore/dataset/engine/datasets.py +++ b/mindspore/dataset/engine/datasets.py @@ -1041,12 +1041,61 @@ class Dataset: """ Save the dynamic data processed by dataset pipeline as common dataset format, support: mindrecord. + Implicit type casting exists when saving data as mindrecord. The table below shows how each type is cast. + + .. list-table:: Implicit Type Casting of Saving as mindrecord + :widths: 25 25 50 + :header-rows: 1 + + * - type in 'dataset' + - type in 'mindrecord' + - detail + * - DE_BOOL + - None + - Not supported + * - DE_INT8 + - int32 + - + * - DE_UINT8 + - bytes(1D uint8) + - Drop dimension + * - DE_INT16 + - int32 + - + * - DE_UINT16 + - int32 + - + * - DE_INT32 + - int32 + - + * - DE_UINT32 + - int64 + - + * - DE_INT64 + - int64 + - + * - DE_UINT64 + - None + - Not supported + * - DE_FLOAT16 + - None + - Not supported + * - DE_FLOAT32 + - float32 + - + * - DE_FLOAT64 + - float64 + - + * - DE_STRING + - string + - Multi-dimensional DE_STRING is not supported + Note: 1. To save the samples in order, should set dataset's shuffle false and num_files 1. 2. Before call the function, do not use batch, repeat operator or data augmentation operators with random attribute in map operator. - 3. Mindreocrd do not support np.uint64, multi-dimensional np.uint8(drop dimension) and - multi-dimensional string. + 3. Mindrecord does not support DE_UINT64, multi-dimensional DE_UINT8(drop dimension) and + multi-dimensional DE_STRING. Args: file_name (str): Path to dataset file.