Commit 716329df authored by mindspore-ci-bot, committed by Gitee

!5278 Change default value of sampler in c-api

Merge pull request !5278 from luoyang/c-api
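In short (an illustrative sketch of the change, not part of the diff below): the sampler parameter of the C++ dataset factory functions now defaults to RandomSampler() instead of nullptr, and an explicit nullptr sampler is rejected by the new ValidateDatasetSampler() check rather than being silently replaced with a default sampler at Build() time.

// Illustrative usage only; "/path/to/imagefolder" is a placeholder path, not taken from this PR.
std::shared_ptr<Dataset> ds1 = ImageFolder("/path/to/imagefolder", true);                            // OK: uses the RandomSampler() default
std::shared_ptr<Dataset> ds2 = ImageFolder("/path/to/imagefolder", true, RandomSampler(false, 10));  // OK: explicit sampler
std::shared_ptr<Dataset> ds3 = ImageFolder("/path/to/imagefolder", true, nullptr);                   // now returns nullptr: sampler must not be null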
......@@ -201,8 +201,8 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
}
// Function to create a ManifestDataset.
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage,
std::shared_ptr<SamplerObj> sampler,
std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const std::string &usage,
const std::shared_ptr<SamplerObj> &sampler,
const std::map<std::string, int32_t> &class_indexing, bool decode) {
auto ds = std::make_shared<ManifestDataset>(dataset_file, usage, sampler, class_indexing, decode);
......@@ -590,13 +590,6 @@ bool SchemaObj::from_json(nlohmann::json json_obj) {
// OTHER FUNCTIONS
// Helper function to create default RandomSampler.
std::shared_ptr<SamplerObj> CreateDefaultSampler() {
const int32_t num_samples = 0; // 0 means to sample all ids.
bool replacement = false;
return std::make_shared<RandomSamplerObj>(replacement, num_samples);
}
// Helper function to compute a default shuffle size
Status ComputeShuffleSize(int64_t num_files, int64_t num_devices, int64_t num_rows, int64_t total_rows,
int64_t *shuffle_size) {
......@@ -692,6 +685,36 @@ bool ValidateDatasetShardParams(const std::string &dataset_name, int32_t num_sha
return true;
}
// Helper function to validate dataset sampler parameter
bool ValidateDatasetSampler(const std::string &dataset_name, const std::shared_ptr<SamplerObj> &sampler) {
if (sampler == nullptr) {
MS_LOG(ERROR) << dataset_name << ": Sampler is not constructed correctly, sampler: nullptr";
return false;
}
return true;
}
// Helper function to validate dataset input/output column parameter
bool ValidateDatasetColumnParam(const std::string &dataset_name, const std::string &column_param,
const std::vector<std::string> &columns) {
if (columns.empty()) {
MS_LOG(ERROR) << dataset_name << ":" << column_param << " should not be empty";
return false;
}
for (uint32_t i = 0; i < columns.size(); ++i) {
if (columns[i].empty()) {
MS_LOG(ERROR) << dataset_name << ":" << column_param << "[" << i << "] should not be empty";
return false;
}
}
std::set<std::string> columns_set(columns.begin(), columns.end());
if (columns_set.size() != columns.size()) {
MS_LOG(ERROR) << dataset_name << ":" << column_param << ": Every column name should not be same with others";
return false;
}
return true;
}
/* ####################################### Derived Dataset classes ################################# */
// DERIVED DATASET CLASSES LEAF-NODE DATASETS
......@@ -716,6 +739,16 @@ bool AlbumDataset::ValidateParams() {
return false;
}
if (!ValidateDatasetSampler("AlbumDataset", sampler_)) {
return false;
}
if (!column_names_.empty()) {
if (!ValidateDatasetColumnParam("AlbumDataset", "column_names", column_names_)) {
return false;
}
}
return true;
}
......@@ -724,11 +757,6 @@ std::vector<std::shared_ptr<DatasetOp>> AlbumDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler, i.e., RandomSampler.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->LoadSchemaFile(schema_path_, column_names_));
......@@ -754,6 +782,9 @@ bool CelebADataset::ValidateParams() {
if (!ValidateDatasetDirParam("CelebADataset", dataset_dir_)) {
return false;
}
if (!ValidateDatasetSampler("CelebADataset", sampler_)) {
return false;
}
std::set<std::string> dataset_type_list = {"all", "train", "valid", "test"};
auto iter = dataset_type_list.find(dataset_type_);
if (iter == dataset_type_list.end()) {
......@@ -768,11 +799,6 @@ std::vector<std::shared_ptr<DatasetOp>> CelebADataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(
schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kFlexible, 1)));
......@@ -789,18 +815,15 @@ std::vector<std::shared_ptr<DatasetOp>> CelebADataset::Build() {
Cifar10Dataset::Cifar10Dataset(const std::string &dataset_dir, std::shared_ptr<SamplerObj> sampler)
: dataset_dir_(dataset_dir), sampler_(sampler) {}
bool Cifar10Dataset::ValidateParams() { return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_); }
bool Cifar10Dataset::ValidateParams() {
return ValidateDatasetDirParam("Cifar10Dataset", dataset_dir_) && ValidateDatasetSampler("Cifar10Dataset", sampler_);
}
// Function to build CifarOp for Cifar10
std::vector<std::shared_ptr<DatasetOp>> Cifar10Dataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
// Do internal Schema generation.
auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
......@@ -818,18 +841,16 @@ std::vector<std::shared_ptr<DatasetOp>> Cifar10Dataset::Build() {
Cifar100Dataset::Cifar100Dataset(const std::string &dataset_dir, std::shared_ptr<SamplerObj> sampler)
: dataset_dir_(dataset_dir), sampler_(sampler) {}
bool Cifar100Dataset::ValidateParams() { return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_); }
bool Cifar100Dataset::ValidateParams() {
return ValidateDatasetDirParam("Cifar100Dataset", dataset_dir_) &&
ValidateDatasetSampler("Cifar100Dataset", sampler_);
}
// Function to build CifarOp for Cifar100
std::vector<std::shared_ptr<DatasetOp>> Cifar100Dataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
// Do internal Schema generation.
auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
......@@ -1045,6 +1066,9 @@ bool CocoDataset::ValidateParams() {
if (!ValidateDatasetDirParam("CocoDataset", dataset_dir_)) {
return false;
}
if (!ValidateDatasetSampler("CocoDataset", sampler_)) {
return false;
}
Path annotation_file(annotation_file_);
if (!annotation_file.Exists()) {
MS_LOG(ERROR) << "annotation_file is invalid or not exist";
......@@ -1064,11 +1088,6 @@ std::vector<std::shared_ptr<DatasetOp>> CocoDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
CocoOp::TaskType task_type;
if (task_ == "Detection") {
task_type = CocoOp::TaskType::Detection;
......@@ -1158,6 +1177,12 @@ bool CSVDataset::ValidateParams() {
return false;
}
if (!column_names_.empty()) {
if (!ValidateDatasetColumnParam("CSVDataset", "column_names", column_names_)) {
return false;
}
}
return true;
}
......@@ -1218,17 +1243,15 @@ ImageFolderDataset::ImageFolderDataset(std::string dataset_dir, bool decode, std
class_indexing_(class_indexing),
exts_(extensions) {}
bool ImageFolderDataset::ValidateParams() { return ValidateDatasetDirParam("ImageFolderDataset", dataset_dir_); }
bool ImageFolderDataset::ValidateParams() {
return ValidateDatasetDirParam("ImageFolderDataset", dataset_dir_) &&
ValidateDatasetSampler("ImageFolderDataset", sampler_);
}
std::vector<std::shared_ptr<DatasetOp>> ImageFolderDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler, i.e., RandomSampler.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
// Do internal Schema generation.
// This arg is exist in ImageFolderOp, but not externalized (in Python API).
std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
......@@ -1243,7 +1266,8 @@ std::vector<std::shared_ptr<DatasetOp>> ImageFolderDataset::Build() {
return node_ops;
}
ManifestDataset::ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
ManifestDataset::ManifestDataset(const std::string &dataset_file, const std::string &usage,
const std::shared_ptr<SamplerObj> &sampler,
const std::map<std::string, int32_t> &class_indexing, bool decode)
: dataset_file_(dataset_file), usage_(usage), decode_(decode), class_index_(class_indexing), sampler_(sampler) {}
......@@ -1254,6 +1278,10 @@ bool ManifestDataset::ValidateParams() {
return false;
}
if (!ValidateDatasetSampler("ManifestDataset", sampler_)) {
return false;
}
std::vector<std::string> usage_list = {"train", "eval", "inference"};
if (find(usage_list.begin(), usage_list.end(), usage_) == usage_list.end()) {
MS_LOG(ERROR) << "usage should be train, eval or inference.";
......@@ -1267,11 +1295,6 @@ std::vector<std::shared_ptr<DatasetOp>> ManifestDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
// Do internal Schema generation.
auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
......@@ -1291,17 +1314,14 @@ std::vector<std::shared_ptr<DatasetOp>> ManifestDataset::Build() {
MnistDataset::MnistDataset(std::string dataset_dir, std::shared_ptr<SamplerObj> sampler)
: dataset_dir_(dataset_dir), sampler_(sampler) {}
bool MnistDataset::ValidateParams() { return ValidateDatasetDirParam("MnistDataset", dataset_dir_); }
bool MnistDataset::ValidateParams() {
return ValidateDatasetDirParam("MnistDataset", dataset_dir_) && ValidateDatasetSampler("MnistDataset", sampler_);
}
std::vector<std::shared_ptr<DatasetOp>> MnistDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler, i.e., RandomSampler.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
// Do internal Schema generation.
auto schema = std::make_unique<DataSchema>();
RETURN_EMPTY_IF_ERROR(schema->AddColumn(ColDescriptor("image", DataType(DataType::DE_UINT8), TensorImpl::kCv, 1)));
......@@ -1320,6 +1340,14 @@ bool RandomDataset::ValidateParams() {
MS_LOG(ERROR) << "RandomDataset: total_rows must be greater than 0, now get " << total_rows_;
return false;
}
if (!ValidateDatasetSampler("RandomDataset", sampler_)) {
return false;
}
if (!columns_list_.empty()) {
if (!ValidateDatasetColumnParam("RandomDataset", "columns_list", columns_list_)) {
return false;
}
}
return true;
}
......@@ -1342,11 +1370,6 @@ std::vector<std::shared_ptr<DatasetOp>> RandomDataset::Build() {
total_rows_ = schema_obj->get_num_rows();
}
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
std::string schema_json_string, schema_file_path;
if (schema_ != nullptr) {
schema_->set_dataset_type("Random");
......@@ -1459,6 +1482,9 @@ bool VOCDataset::ValidateParams() {
MS_LOG(ERROR) << "Invalid dataset path or no dataset path is specified.";
return false;
}
if (!ValidateDatasetSampler("VOCDataset", sampler_)) {
return false;
}
if (task_ == "Segmentation") {
if (!class_index_.empty()) {
MS_LOG(ERROR) << "class_indexing is invalid in Segmentation task.";
......@@ -1487,11 +1513,6 @@ std::vector<std::shared_ptr<DatasetOp>> VOCDataset::Build() {
// A vector containing shared pointer to the Dataset Ops that this object will create
std::vector<std::shared_ptr<DatasetOp>> node_ops;
// If user does not specify Sampler, create a default sampler based on the shuffle variable.
if (sampler_ == nullptr) {
sampler_ = CreateDefaultSampler();
}
auto schema = std::make_unique<DataSchema>();
VOCOp::TaskType task_type_;
......@@ -1657,7 +1678,21 @@ bool MapDataset::ValidateParams() {
MS_LOG(ERROR) << "Map: No operation is specified.";
return false;
}
if (!input_columns_.empty()) {
if (!ValidateDatasetColumnParam("MapDataset", "input_columns", input_columns_)) {
return false;
}
}
if (!output_columns_.empty()) {
if (!ValidateDatasetColumnParam("MapDataset", "output_columns", output_columns_)) {
return false;
}
}
if (!project_columns_.empty()) {
if (!ValidateDatasetColumnParam("MapDataset", "project_columns", project_columns_)) {
return false;
}
}
return true;
}
......@@ -1686,23 +1721,13 @@ RenameDataset::RenameDataset(const std::vector<std::string> &input_columns,
: input_columns_(input_columns), output_columns_(output_columns) {}
bool RenameDataset::ValidateParams() {
if (input_columns_.empty() || output_columns_.empty()) {
MS_LOG(ERROR) << "input and output columns must be specified";
return false;
}
if (input_columns_.size() != output_columns_.size()) {
MS_LOG(ERROR) << "input and output columns must be the same size";
MS_LOG(ERROR) << "RenameDataset: input and output columns must be the same size";
return false;
}
for (uint32_t i = 0; i < input_columns_.size(); ++i) {
if (input_columns_[i].empty()) {
MS_LOG(ERROR) << "input_columns: column name should not be empty.";
return false;
}
if (output_columns_[i].empty()) {
MS_LOG(ERROR) << "output_columns: column name should not be empty.";
return false;
}
if (!ValidateDatasetColumnParam("RenameDataset", "input_columns", input_columns_) ||
!ValidateDatasetColumnParam("RenameDataset", "output_columns", output_columns_)) {
return false;
}
return true;
}
......
......@@ -87,44 +87,44 @@ std::shared_ptr<SchemaObj> Schema(const std::string &schema_file = "");
/// \param[in] column_names Column names used to specify columns to load, if empty, will read all columns.
/// (default = {})
/// \param[in] decode the option to decode the images in dataset (default = false)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset (default = nullptr)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset
std::shared_ptr<AlbumDataset> Album(const std::string &dataset_dir, const std::string &data_schema,
const std::vector<std::string> &column_names = {}, bool decode = false,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
/// \brief Function to create a CelebADataset
/// \notes The generated dataset has two columns ['image', 'attr'].
// The type of the image tensor is uint8. The attr tensor is uint32 and one hot type.
/// \param[in] dataset_dir Path to the root directory that contains the dataset.
/// \param[in] dataset_type One of 'all', 'train', 'valid' or 'test'.
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \param[in] decode Decode the images after reading (default=false).
/// \param[in] extensions Set of file extensions to be included in the dataset (default={}).
/// \return Shared pointer to the current Dataset
std::shared_ptr<CelebADataset> CelebA(const std::string &dataset_dir, const std::string &dataset_type = "all",
const std::shared_ptr<SamplerObj> &sampler = nullptr, bool decode = false,
const std::shared_ptr<SamplerObj> &sampler = RandomSampler(), bool decode = false,
const std::set<std::string> &extensions = {});
/// \brief Function to create a Cifar10 Dataset
/// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset
std::shared_ptr<Cifar10Dataset> Cifar10(const std::string &dataset_dir,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
/// \brief Function to create a Cifar100 Dataset
/// \notes The generated dataset has three columns ['image', 'coarse_label', 'fine_label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset
std::shared_ptr<Cifar100Dataset> Cifar100(const std::string &dataset_dir,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
/// \brief Function to create a CLUEDataset
/// \notes The generated dataset has a variable number of columns depending on the task and usage
......@@ -161,12 +161,12 @@ std::shared_ptr<CLUEDataset> CLUE(const std::vector<std::string> &dataset_files,
/// \param[in] annotation_file Path to the annotation json
/// \param[in] task Set the task type of reading coco data, now support 'Detection'/'Stuff'/'Panoptic'/'Keypoint'
/// \param[in] decode Decode the images after reading
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset
std::shared_ptr<CocoDataset> Coco(const std::string &dataset_dir, const std::string &annotation_file,
const std::string &task = "Detection", const bool &decode = false,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
/// \brief Function to create a CSVDataset
/// \notes The generated dataset has a variable number of columns
......@@ -200,13 +200,13 @@ std::shared_ptr<CSVDataset> CSV(const std::vector<std::string> &dataset_files, c
/// The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] decode A flag to decode in ImageFolder
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \param[in] extensions File extensions to be read
/// \param[in] class_indexing a class name to label map
/// \return Shared pointer to the current ImageFolderDataset
std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir, bool decode = false,
const std::shared_ptr<SamplerObj> &sampler = nullptr,
const std::shared_ptr<SamplerObj> &sampler = RandomSampler(),
const std::set<std::string> &extensions = {},
const std::map<std::string, int32_t> &class_indexing = {});
......@@ -214,25 +214,25 @@ std::shared_ptr<ImageFolderDataset> ImageFolder(const std::string &dataset_dir,
/// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_file The dataset file to be read
/// \param[in] usage Need "train", "eval" or "inference" data (default="train")
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \param[in] class_indexing A str-to-int mapping from label name to index (default={}, the folder
/// names will be sorted alphabetically and each class will be given a unique index starting from 0).
/// \param[in] decode Decode the images after reading (default=false).
/// \return Shared pointer to the current ManifestDataset
std::shared_ptr<ManifestDataset> Manifest(std::string dataset_file, std::string usage = "train",
std::shared_ptr<SamplerObj> sampler = nullptr,
std::shared_ptr<ManifestDataset> Manifest(const std::string &dataset_file, const std::string &usage = "train",
const std::shared_ptr<SamplerObj> &sampler = RandomSampler(),
const std::map<std::string, int32_t> &class_indexing = {},
bool decode = false);
/// \brief Function to create a MnistDataset
/// \notes The generated dataset has two columns ['image', 'label']
/// \param[in] dataset_dir Path to the root directory that contains the dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`,
/// A `RandomSampler` will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current MnistDataset
std::shared_ptr<MnistDataset> Mnist(const std::string &dataset_dir,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
/// \brief Function to create a ConcatDataset
/// \notes Reload "+" operator to concat two datasets
......@@ -246,14 +246,14 @@ std::shared_ptr<ConcatDataset> operator+(const std::shared_ptr<Dataset> &dataset
/// \param[in] total_rows Number of rows for the dataset to generate (default=0, number of rows is random)
/// \param[in] schema SchemaObj to set column type, data type and data shape
/// \param[in] columns_list List of columns to be read (default={}, read all columns)
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset
template <typename T = std::shared_ptr<SchemaObj>>
std::shared_ptr<RandomDataset> RandomData(const int32_t &total_rows = 0, T schema = nullptr,
const std::vector<std::string> &columns_list = {},
std::shared_ptr<SamplerObj> sampler = nullptr) {
auto ds = std::make_shared<RandomDataset>(total_rows, schema, std::move(columns_list), std::move(sampler));
const std::shared_ptr<SamplerObj> &sampler = RandomSampler()) {
auto ds = std::make_shared<RandomDataset>(total_rows, schema, columns_list, std::move(sampler));
return ds->ValidateParams() ? ds : nullptr;
}
......@@ -286,13 +286,13 @@ std::shared_ptr<TextFileDataset> TextFile(const std::vector<std::string> &datase
/// \param[in] mode Set the data list txt file to be readed
/// \param[in] class_indexing A str-to-int mapping from label name to index
/// \param[in] decode Decode the images after reading
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is `nullptr`, A `RandomSampler`
/// will be used to randomly iterate the entire dataset
/// \param[in] sampler Object used to choose samples from the dataset. If sampler is not given,
/// a `RandomSampler` will be used to randomly iterate the entire dataset (default = RandomSampler())
/// \return Shared pointer to the current Dataset
std::shared_ptr<VOCDataset> VOC(const std::string &dataset_dir, const std::string &task = "Segmentation",
const std::string &mode = "train",
const std::map<std::string, int32_t> &class_indexing = {}, bool decode = false,
const std::shared_ptr<SamplerObj> &sampler = nullptr);
const std::shared_ptr<SamplerObj> &sampler = RandomSampler());
/// \brief Function to create a ZipDataset
/// \notes Applies zip to the dataset
......@@ -756,7 +756,7 @@ class ImageFolderDataset : public Dataset {
class ManifestDataset : public Dataset {
public:
/// \brief Constructor
ManifestDataset(std::string dataset_file, std::string usage, std::shared_ptr<SamplerObj> sampler,
ManifestDataset(const std::string &dataset_file, const std::string &usage, const std::shared_ptr<SamplerObj> &sampler,
const std::map<std::string, int32_t> &class_indexing, bool decode);
/// \brief Destructor
......@@ -808,7 +808,7 @@ class RandomDataset : public Dataset {
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::shared_ptr<SchemaObj> schema,
const std::vector<std::string> &columns_list, std::shared_ptr<SamplerObj> sampler)
const std::vector<std::string> &columns_list, const std::shared_ptr<SamplerObj> &sampler)
: total_rows_(total_rows),
schema_path_(""),
schema_(std::move(schema)),
......@@ -816,8 +816,8 @@ class RandomDataset : public Dataset {
sampler_(std::move(sampler)) {}
/// \brief Constructor
RandomDataset(const int32_t &total_rows, std::string schema_path, std::vector<std::string> columns_list,
std::shared_ptr<SamplerObj> sampler)
RandomDataset(const int32_t &total_rows, std::string schema_path, const std::vector<std::string> &columns_list,
const std::shared_ptr<SamplerObj> &sampler)
: total_rows_(total_rows), schema_path_(schema_path), columns_list_(columns_list), sampler_(std::move(sampler)) {}
/// \brief Destructor
......
......@@ -93,7 +93,7 @@ TEST_F(MindDataTestPipeline, TestAlbumDecode) {
TEST_F(MindDataTestPipeline, TestAlbumNumSamplers) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumNumSamplers.";
std::string folder_path = datasets_root_path_ + "/testAlbum/images";
std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
std::vector<std::string> column_names = {"image", "label", "id"};
......@@ -134,3 +134,25 @@ TEST_F(MindDataTestPipeline, TestAlbumError) {
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestAlbumWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumWithNullSampler.";
std::string folder_path = datasets_root_path_ + "/testAlbum/images";
std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
std::vector<std::string> column_names = {"image", "label", "id"};
// Create an Album Dataset
std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestAlbumDuplicateColumnName) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAlbumDuplicateColumnName.";
std::string folder_path = datasets_root_path_ + "/testAlbum/images";
std::string schema_file = datasets_root_path_ + "/testAlbum/datasetSchema.json";
std::vector<std::string> column_names = {"image", "image", "id"};
// Create an Album Dataset
std::shared_ptr<Dataset> ds = Album(folder_path, schema_file, column_names, true);
// Expect failure: duplicate column names
EXPECT_EQ(ds, nullptr);
}
......@@ -107,3 +107,33 @@ TEST_F(MindDataTestPipeline, TestCifar10DatasetFail1) {
std::shared_ptr<Dataset> ds = Cifar10("", RandomSampler(false, 10));
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestCifar10DatasetWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar10DatasetWithNullSampler.";
// Create a Cifar10 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar10Data/";
std::shared_ptr<Dataset> ds = Cifar10(folder_path, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestCifar100DatasetWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithNullSampler.";
// Create a Cifar100 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
std::shared_ptr<Dataset> ds = Cifar100(folder_path, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestCifar100DatasetWithWrongSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCifar100DatasetWithWrongSampler.";
// Create a Cifar100 Dataset
std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
std::shared_ptr<Dataset> ds = Cifar100(folder_path, RandomSampler(false, -10));
// Expect failure: sampler is not constructed correctly
EXPECT_EQ(ds, nullptr);
}
......@@ -290,3 +290,14 @@ TEST_F(MindDataTestPipeline, TestCocoStuff) {
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestCocoWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCocoWithNullSampler.";
// Create a Coco Dataset
std::string folder_path = datasets_root_path_ + "/testCOCO/train";
std::string annotation_file = datasets_root_path_ + "/testCOCO/annotations/train.json";
std::shared_ptr<Dataset> ds = Coco(folder_path, annotation_file, "Detection", false, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
......@@ -533,3 +533,14 @@ TEST_F(MindDataTestPipeline, TestCSVDatasetShuffleGlobal) {
GlobalContext::config_manager()->set_seed(original_seed);
GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
}
TEST_F(MindDataTestPipeline, TestCSVDatasetDuplicateColumnName) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCSVDatasetDuplicateColumnName.";
// Create a CSVDataset, with single CSV file
std::string train_file = datasets_root_path_ + "/testCSV/1.csv";
std::vector<std::string> column_names = {"col1", "col1", "col3", "col4"};
std::shared_ptr<Dataset> ds = CSV({train_file}, ',', {}, column_names, -1, ShuffleMode::kFalse);
// Expect failure: duplicate column names
EXPECT_EQ(ds, nullptr);
}
......@@ -59,7 +59,7 @@ TEST_F(MindDataTestPipeline, TestManifestDecode) {
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, {}, true);
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", RandomSampler(), {}, true);
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
......@@ -130,7 +130,7 @@ TEST_F(MindDataTestPipeline, TestManifestClassIndex) {
std::vector<int> expected_label = {111, 222};
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr, map, true);
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", RandomSampler(), map, true);
EXPECT_NE(ds, nullptr);
// Create an iterator over the result of the above dataset
......@@ -204,3 +204,12 @@ TEST_F(MindDataTestPipeline, TestManifestError) {
std::shared_ptr<Dataset> ds1 = Manifest(file_path, "invalid_usage");
EXPECT_EQ(ds1, nullptr);
}
TEST_F(MindDataTestPipeline, TestManifestWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestManifestWithNullSampler.";
std::string file_path = datasets_root_path_ + "/testManifestData/cpp.json";
// Create a Manifest Dataset
std::shared_ptr<Dataset> ds = Manifest(file_path, "train", nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
......@@ -311,6 +311,34 @@ TEST_F(MindDataTestPipeline, TestProjectMap) {
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestMapDuplicateColumn) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMapDuplicateColumn.";
// Create an ImageFolder Dataset
std::string folder_path = datasets_root_path_ + "/testPK/data/";
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, RandomSampler(false, 10));
EXPECT_NE(ds, nullptr);
// Create objects for the tensor ops
std::shared_ptr<TensorOperation> random_vertical_flip_op = vision::RandomVerticalFlip(0.5);
EXPECT_NE(random_vertical_flip_op, nullptr);
// Create a Map operation on ds
auto ds1 = ds->Map({random_vertical_flip_op}, {"image", "image"}, {}, {});
// Expect failure: duplicate input column name
EXPECT_EQ(ds1, nullptr);
// Create a Map operation on ds
auto ds2 = ds->Map({random_vertical_flip_op}, {}, {"label", "label"}, {});
// Expect failure: duplicate output column name
EXPECT_EQ(ds2, nullptr);
// Create a Map operation on ds
auto ds3 = ds->Map({random_vertical_flip_op}, {}, {}, {"image", "image"});
// Expect failure: duplicate project column name
EXPECT_EQ(ds3, nullptr);
}
TEST_F(MindDataTestPipeline, TestProjectMapAutoInjection) {
MS_LOG(INFO) << "Doing MindDataTestPipeline.TestProjectMapAutoInjection";
......@@ -395,6 +423,24 @@ TEST_F(MindDataTestPipeline, TestRenameFail2) {
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestRenameFail3) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameFail3.";
// We expect this test to fail because of duplicate column names
// Create an ImageFolder Dataset
std::string folder_path = datasets_root_path_ + "/testPK/data/";
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, RandomSampler(false, 10));
EXPECT_NE(ds, nullptr);
// Create a Rename operation on ds
auto ds1 = ds->Rename({"image", "image"}, {"col1", "col2"});
EXPECT_EQ(ds1, nullptr);
// Create a Rename operation on ds
auto ds2 = ds->Rename({"image", "label"}, {"col1", "col1"});
EXPECT_EQ(ds2, nullptr);
}
TEST_F(MindDataTestPipeline, TestRenameSuccess) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRenameSuccess.";
......
......@@ -265,4 +265,28 @@ TEST_F(MindDataTestPipeline, TestRandomDatasetBasic4) {
// Manually terminate the pipeline
iter->Stop();
GlobalContext::config_manager()->set_seed(curr_seed);
}
\ No newline at end of file
}
TEST_F(MindDataTestPipeline, TestRandomDatasetWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetWithNullSampler.";
// Create a RandomDataset
std::shared_ptr<SchemaObj> schema = Schema();
schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
std::shared_ptr<Dataset> ds = RandomData(50, schema, {}, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestRandomDatasetDuplicateColumnName) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRandomDatasetDuplicateColumnName.";
// Create a RandomDataset
std::shared_ptr<SchemaObj> schema = Schema();
schema->add_column("image", mindspore::TypeId::kNumberTypeUInt8, {2});
schema->add_column("label", mindspore::TypeId::kNumberTypeUInt8, {1});
std::shared_ptr<Dataset> ds = RandomData(50, schema, {"image", "image"});
// Expect failure: duplicate column names
EXPECT_EQ(ds, nullptr);
}
......@@ -194,3 +194,13 @@ TEST_F(MindDataTestPipeline, TestVOCSegmentationError1) {
// Expect nullptr for segmentation task with class_index
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestVOCWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVOCWithNullSampler.";
// Create a VOC Dataset
std::string folder_path = datasets_root_path_ + "/testVOC2012_2";
std::shared_ptr<Dataset> ds = VOC(folder_path, "Segmentation", "train", {}, false, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
......@@ -118,24 +118,44 @@ TEST_F(MindDataTestPipeline, TestCelebAException) {
EXPECT_EQ(ds1, nullptr);
}
TEST_F(MindDataTestPipeline, TestImageFolderFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail1.";
TEST_F(MindDataTestPipeline, TestCelebADatasetWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCelebADataset.";
// Create an ImageFolder Dataset
std::shared_ptr<Dataset> ds = ImageFolder("", true, nullptr);
// Create a CelebA Dataset
std::string folder_path = datasets_root_path_ + "/testCelebAData/";
std::shared_ptr<Dataset> ds = CelebA(folder_path, "all", nullptr, false, {});
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestMnistFail1) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFail1.";
TEST_F(MindDataTestPipeline, TestMnistFailWithWrongDatasetDir) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithWrongDatasetDir.";
// Create a Mnist Dataset
std::shared_ptr<Dataset> ds = Mnist("", RandomSampler(false, 10));
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestImageFolderFail2) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFail2.";
TEST_F(MindDataTestPipeline, TestMnistFailWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestMnistFailWithNullSampler.";
// Create a Mnist Dataset
std::string folder_path = datasets_root_path_ + "/testMnistData/";
std::shared_ptr<Dataset> ds = Mnist(folder_path, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestImageFolderWithWrongDatasetDir) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderWithWrongDatasetDir.";
// Create an ImageFolder Dataset
std::shared_ptr<Dataset> ds = ImageFolder("", true, nullptr);
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongExtension) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongExtension.";
// Create an ImageFolder Dataset
std::string folder_path = datasets_root_path_ + "/testPK/data/";
......@@ -150,8 +170,29 @@ TEST_F(MindDataTestPipeline, TestImageFolderFail2) {
// Iterate the dataset and get each row
std::unordered_map<std::string, std::shared_ptr<Tensor>> row;
iter->GetNextRow(&row);
// Expect no data: cannot find files with the specified extension
EXPECT_EQ(row.size(), 0);
// Manually terminate the pipeline
iter->Stop();
}
TEST_F(MindDataTestPipeline, TestImageFolderFailWithNullSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithNullSampler.";
// Create an ImageFolder Dataset
std::string folder_path = datasets_root_path_ + "/testPK/data/";
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, nullptr);
// Expect failure: sampler can not be nullptr
EXPECT_EQ(ds, nullptr);
}
TEST_F(MindDataTestPipeline, TestImageFolderFailWithWrongSampler) {
MS_LOG(INFO) << "Doing MindDataTestPipeline-TestImageFolderFailWithWrongSampler.";
// Create an ImageFolder Dataset
std::string folder_path = datasets_root_path_ + "/testCifar100Data/";
std::shared_ptr<Dataset> ds = ImageFolder(folder_path, true, SequentialSampler(-2, 5));
// Expect failure: sampler is not constructed correctly
EXPECT_EQ(ds, nullptr);
}